class TestPeptide(TestCase):
    """Unit tests for ``Peptide`` and its links to proteins, transcripts and variants."""

    def setUp(self):
        """Build three fixtures: a bare peptide, one tied to a protein, one tied to a variant protein."""
        self.simple = Peptide("SYFPEITHI")
        self.gcg_ps = "MKSIYFVAGLFVMLVQGSWQRSLQDTEEKSRSFSASQADPLSDPDQMNEDKRHSQGTFTSDYSKYLDSRRAQDFVQWLMNTKRNRNNIAKRHDEFERHAEGTFTSDVSSYLEGQAAKEFIAWLVKGRGRRDFPEEVAIVEELGRRHADGSFSDEMNTILDNLAARDFINWLIQTKITDRK"
        self.gcg_t1 = Transcript("", transcript_id="GLUC_HUMAN")
        gcg_p1 = Protein(self.gcg_ps,
                         transcript_id='GLUC_HUMAN',
                         orig_transcript=self.gcg_t1)
        self.w_p = Peptide("PROTEIN", {gcg_p1: [0]})
        self.gcg_p1 = gcg_p1
        self.gcg_v1 = Variant(
            "rs5650", VariationType.SNP, 2, 162145588, 'G', 'T', {
                "GLUC_HUMAN":
                MutationSyntax("GLUC_HUMAN", 344, 115, "c.344C>A", "p.A115D")
            }, False, False)
        # The variant is attached to a deep copy so that self.gcg_p1 stays variant-free.
        gcg_p1_copy = copy.deepcopy(gcg_p1)
        gcg_p1_copy.vars = {0: [self.gcg_v1]}
        self.w_v = Peptide("VARIANT", {gcg_p1_copy: [0]})
        # NOTE: a transcript-level fixture (sequence "NM_002054.4" carrying the
        # variant at position 344) only ever existed as commented-out code and
        # is intentionally not built here.

    def test_consistency(self):
        """
        Check the textual representation produced for each fixture.

        The test has several asserts; if one fails, the following ones are
        not evaluated.
        """
        self.assertEqual(repr(self.simple), "PEPTIDE:\n SYFPEITHI")
        self.assertEqual(
            repr(self.w_p),
            "PEPTIDE:\n PROTEIN\nin TRANSCRIPT: GLUC_HUMAN\n\tVARIANTS:\nin PROTEIN: GLUC_HUMAN"
        )
        self.assertEqual(
            repr(self.w_v),
            "PEPTIDE:\n VARIANT\nin TRANSCRIPT: GLUC_HUMAN\n\tVARIANTS:\n\tVariant(g.162145588G>T)\nin PROTEIN: GLUC_HUMAN"
        )

    def test_getitem(self):
        """Slicing a peptide compares equal to the corresponding substring."""
        self.assertEqual(self.simple[1:3], 'YF')

    #TODO: document to have variant peptides from Protein with Variants use Generator
    def test_get_all_variants(self):
        """The variant attached to the copied protein is reported for its transcript id."""
        self.assertEqual(
            repr(self.w_v.get_variants_by_protein("GLUC_HUMAN")),
            repr([self.gcg_v1]))

    def test_get_all_proteins(self):
        """A bare peptide has no proteins; a linked peptide reports its protein."""
        self.assertEqual(repr(self.simple.get_all_proteins()), repr([]))
        self.assertEqual(repr(self.w_p.get_all_proteins()),
                         repr([self.gcg_p1]))

    def test_get_all_transcripts(self):
        """Transcripts are reachable through the linked proteins (compared via repr)."""
        self.assertEqual(
            repr(self.w_v.get_all_transcripts()),
            repr([Transcript(seq="", transcript_id="GLUC_HUMAN")]))
        self.assertEqual(repr(self.w_p.get_all_transcripts()),
                         repr([self.gcg_t1]))
def setUp(self):
    """Create the peptide, allele and transcript fixtures used by the tests of this case."""
    mhc1_sequences = ("SYFPEITHI", "IHTIEPFYS")
    mhc2_sequences = ("AAAAAASYFPEITHI", "IHTIEPFYSAAAAAA")
    self.peptides_mhcI = [Peptide(seq) for seq in mhc1_sequences]
    self.peptides_mhcII = [Peptide(seq) for seq in mhc2_sequences]

    self.mhcI = [Allele(name) for name in ("HLA-B*07:02", "HLA-A*02:01")]
    self.mhcII = [
        Allele(name) for name in ("HLA-DRB1*07:01", "HLA-DRB1*15:01")
    ]
    # Class II alleles given as alpha/beta chain combinations.
    combined_names = ("DPA1*01:03-DPB1*01:01", "DQA1*06:02-DQB1*06:31")
    self.mhcII_combined_alleles = [
        CombinedAllele(name) for name in combined_names
    ]

    self.transcript = Transcript("")
def setUp(self):
    """
    Create peptide and allele fixtures for MHC class I and class II tests.

    The class I list holds two 9-mers; the class II list deliberately mixes
    a 9-mer with a 15-mer.
    """
    self.peptides_mhcI = [
        Peptide(seq) for seq in ("SYFPEITHI", "IHTIEPFYS")
    ]
    self.peptides_mhcII = [
        Peptide(seq) for seq in ("SYFPEITHI", "IHTIEPFYSAAAAAA")
    ]
    self.mhcI = [Allele(name) for name in ("HLA-B*15:01", "HLA-A*02:01")]
    self.mhcII = [
        Allele(name) for name in ("HLA-DRB1*07:01", "HLA-DRB1*15:01")
    ]
def test_simple_assembly(self):
    """
    Smoke test for the epitope assembly with the PCM cleavage predictor.

    The expected ordering was checked by hand for optimality.

    :return:
    """
    cleavage_predictor = CleavageSitePredictorFactory("PCM")
    ordering = EpitopeAssembly(self.peptides,
                               cleavage_predictor,
                               solver="cbc",
                               verbosity=0).solve()
    expected_sequences = ("YLYDHLAPM", "ALYDVVSTL", "KLLPRLPGV")
    self.assertEqual(ordering, [Peptide(seq) for seq in expected_sequences])
def setUp(self):
    """Build a bare peptide, a peptide linked to a protein, and one linked to a variant protein."""
    self.simple = Peptide("SYFPEITHI")

    # Protein sequence and (empty) transcript, both registered as GLUC_HUMAN.
    self.gcg_ps = "MKSIYFVAGLFVMLVQGSWQRSLQDTEEKSRSFSASQADPLSDPDQMNEDKRHSQGTFTSDYSKYLDSRRAQDFVQWLMNTKRNRNNIAKRHDEFERHAEGTFTSDVSSYLEGQAAKEFIAWLVKGRGRRDFPEEVAIVEELGRRHADGSFSDEMNTILDNLAARDFINWLIQTKITDRK"
    self.gcg_t1 = Transcript("", _transcript_id="GLUC_HUMAN")
    self.gcg_p1 = Protein(self.gcg_ps,
                          _transcript_id='GLUC_HUMAN',
                          _orig_transcript=self.gcg_t1)
    self.w_p = Peptide("PROTEIN", {self.gcg_p1: [0]})

    coding = {
        "GLUC_HUMAN":
        MutationSyntax("GLUC_HUMAN", 344, 115, "c.344C>A", "p.A115D")
    }
    self.gcg_v1 = Variant("rs5650", VariationType.SNP, 2, 162145588, 'G',
                          'T', coding, False, False)

    # The variant goes onto a deep copy so that self.gcg_p1 stays variant-free.
    variant_protein = copy.deepcopy(self.gcg_p1)
    variant_protein.vars = {0: [self.gcg_v1]}
    self.w_v = Peptide("VARIANT", {variant_protein: [0]})
def compute_affinities(input_alleles, input_peptides, output_affinities,
                       processes, predictor):
    '''
    Computes the binding affinities between the given peptides and HLA alleles
    and writes them, batch by batch, to a CSV file.

    Peptides are read from the ``peptide`` column of ``input_peptides`` and
    processed in batches of 256, ordered by the number of ``;``-separated
    entries in their ``proteins`` column (most first). The first batch creates
    ``output_affinities`` with a header; later batches are appended.
    '''
    allele_table = utilities.get_alleles_and_thresholds(input_alleles)
    alleles = []
    for allele_name in allele_table.index:
        alleles.append(Allele(allele_name.replace('HLA-', '')))
    LOGGER.info('Loaded %d alleles', len(alleles))

    peptides = []
    with open(input_peptides) as f:
        for row in csv.DictReader(f):
            peptides.append(
                (Peptide(row['peptide']), len(row['proteins'].split(';'))))

    def protein_count(entry):
        return entry[1]

    peptides.sort(key=protein_count, reverse=True)
    LOGGER.info('Loaded %d peptides', len(peptides))

    ordered_peptides = (peptide for peptide, _ in peptides)
    tasks = ((predictor.lower(), batch, alleles)
             for batch in utilities.batches(ordered_peptides, bsize=256))
    results = utilities.parallel_apply(get_binding_affinity_process, tasks,
                                       processes)

    count = 0
    for bindings in results:
        # Only the very first chunk (count still zero) truncates the file
        # and emits the header row.
        is_first = count == 0
        bindings.to_csv(output_affinities,
                        header=is_first,
                        mode=('w' if is_first else 'a'))
        count += len(bindings)
        LOGGER.debug('Processed %d peptides (%.2f%%)...', count,
                     100 * count / len(peptides))
class TestPeptide(TestCase):
    """Unit tests for ``Peptide`` and its links to proteins, transcripts and variants."""

    def setUp(self):
        """Build three fixtures: a bare peptide, one tied to a protein, one tied to a variant protein."""
        self.simple = Peptide("SYFPEITHI")
        self.gcg_ps = "MKSIYFVAGLFVMLVQGSWQRSLQDTEEKSRSFSASQADPLSDPDQMNEDKRHSQGTFTSDYSKYLDSRRAQDFVQWLMNTKRNRNNIAKRHDEFERHAEGTFTSDVSSYLEGQAAKEFIAWLVKGRGRRDFPEEVAIVEELGRRHADGSFSDEMNTILDNLAARDFINWLIQTKITDRK"
        self.gcg_t1 = Transcript("", _transcript_id="GLUC_HUMAN")
        gcg_p1 = Protein(self.gcg_ps,
                         _transcript_id='GLUC_HUMAN',
                         _orig_transcript=self.gcg_t1)
        self.w_p = Peptide("PROTEIN", {gcg_p1: [0]})
        self.gcg_p1 = gcg_p1
        self.gcg_v1 = Variant(
            "rs5650", VariationType.SNP, 2, 162145588, 'G', 'T', {
                "GLUC_HUMAN":
                MutationSyntax("GLUC_HUMAN", 344, 115, "c.344C>A", "p.A115D")
            }, False, False)
        # The variant is attached to a deep copy so that self.gcg_p1 stays variant-free.
        gcg_p1_copy = copy.deepcopy(gcg_p1)
        gcg_p1_copy.vars = {0: [self.gcg_v1]}
        self.w_v = Peptide("VARIANT", {gcg_p1_copy: [0]})
        # NOTE: a transcript-level fixture (sequence "NM_002054.4" carrying the
        # variant at position 344) only ever existed as commented-out code and
        # is intentionally not built here.

    def test_consistency(self):
        """
        Check the textual representation produced for each fixture.

        The test has several asserts; if one fails, the following ones are
        not evaluated.
        """
        self.assertEqual(repr(self.simple), "PEPTIDE:\n SYFPEITHI")
        self.assertEqual(
            repr(self.w_p),
            "PEPTIDE:\n PROTEIN\nin TRANSCRIPT: GLUC_HUMAN\n\tVARIANTS:\nin PROTEIN: GLUC_HUMAN"
        )
        self.assertEqual(
            repr(self.w_v),
            "PEPTIDE:\n VARIANT\nin TRANSCRIPT: GLUC_HUMAN\n\tVARIANTS:\n\tVariant(g.162145588G>T)\nin PROTEIN: GLUC_HUMAN"
        )

    def test_getitem(self):
        """Slicing a peptide compares equal to the corresponding substring."""
        self.assertEqual(self.simple[1:3], 'YF')

    #TODO: document to have variant peptides from Protein with Variants use Generator
    def test_get_all_variants(self):
        """The variant attached to the copied protein is reported for its transcript id."""
        self.assertEqual(
            repr(self.w_v.get_variants_by_protein("GLUC_HUMAN")),
            repr([self.gcg_v1]))

    def test_get_all_proteins(self):
        """A bare peptide has no proteins; a linked peptide reports its protein."""
        self.assertEqual(repr(self.simple.get_all_proteins()), repr([]))
        self.assertEqual(repr(self.w_p.get_all_proteins()),
                         repr([self.gcg_p1]))

    def test_get_all_transcripts(self):
        """Transcripts are reachable through the linked proteins (compared via repr)."""
        self.assertEqual(
            repr(self.w_v.get_all_transcripts()),
            repr([Transcript(_seq="", _transcript_id="GLUC_HUMAN")]))
        self.assertEqual(repr(self.w_p.get_all_transcripts()),
                         repr([self.gcg_t1]))
def read_lines(file):
    """
    Read one peptide per line from a text file.

    Blank lines and lines starting with ``#``, ``Epitope`` or ``Sequence``
    are skipped; for every other line only the first whitespace-separated
    token is used as the peptide sequence.

    :param str file: Path of the file to read
    :return: The peptides in file order
    """
    ignored_prefixes = ("#", "Epitope", "Sequence")
    peptides = []
    with open(file, "r") as handle:
        for line in handle:
            if line.startswith(ignored_prefixes) or line.strip() == "":
                continue
            first_token = line.split()[0].strip()
            peptides.append(Peptide(first_token))
    return peptides
def optitope(input_affinities, input_peptides, input_alleles, output_vaccine,
             epitopes, min_alleles, min_proteins):
    """
    Select a vaccine with OptiTope and write it to ``output_vaccine`` as CSV.

    The output has the columns ``cocktail``, ``index`` and ``epitope``; the
    cocktail column is always 0. The allele and antigen coverage constraints
    are only activated when ``min_alleles`` / ``min_proteins`` are positive.
    Per-epitope and total immunogenicity are logged.
    """
    peptides = {}
    with open(input_peptides) as f:
        for row in csv.DictReader(f):
            peptide = Peptide(row['peptide'])
            # we don't really need the actual protein sequence, just fill it
            # with the id to make it unique
            peptides[peptide] = set(
                Protein(gid, gene_id=gid) for gid in row['proteins'].split(';'))
    LOGGER.info('Loaded %d peptides', len(peptides))

    allele_data = utilities.get_alleles_and_thresholds(input_alleles).to_dict(
        'index')
    thresholds = {}
    for allele, data in allele_data.items():
        thresholds[allele.replace('HLA-', '')] = data['threshold']
    LOGGER.info('Loaded %d alleles', len(thresholds))

    affinities = utilities.affinities_from_csv(input_affinities,
                                               allele_data,
                                               peptide_coverage=peptides)
    LOGGER.info('Loaded %d affinities', len(affinities))

    LOGGER.info("Creating vaccine...")
    model = OptiTope(affinities, thresholds, k=epitopes, solver='gurobi')

    if min_alleles > 0:
        model.activate_allele_coverage_const(min_alleles)
        LOGGER.info('Vaccine will cover at least %f alleles', min_alleles)

    if min_proteins > 0:
        model.activate_antigen_coverage_const(min_proteins)
        LOGGER.info('Vaccine will cover at least %f proteins', min_proteins)

    vaccine = model.solve()

    LOGGER.info('Vaccine summary:')
    with open(output_vaccine, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('cocktail', 'index', 'epitope'))

        total_ig = 0.0
        instance = model.instance
        for position, epitope in enumerate(vaccine):
            writer.writerow((0, position, epitope))
            # Allele-probability-weighted immunogenicity of this epitope.
            epitope_immunog = sum(instance.p[a] * instance.i[epitope, a]
                                  for a in instance.A)
            total_ig += epitope_immunog
            LOGGER.info('    %s - %.2f', epitope, epitope_immunog)
        LOGGER.info('Total immunogenicity: %.2f', total_ig)
def get_cleavage_score_process(penalty, cleavage_model, window_size, epitopes):
    """
    Score every ordered epitope pair by the cleavage predicted around its junction.

    For each ``(ep_from, ep_to)`` the concatenated sequence is run through the
    PCM predictor. Within a window of ``(window_size - 1) / 2`` positions on
    either side of the last residue of ``ep_from``, the prediction at the
    junction itself is weighted with -1 and every other position with
    ``penalty``.

    :return: list of ``(ep_from, ep_to, score)`` tuples
    """
    #predictor = CleavageSitePredictorFactory(cleavage_model)
    assert cleavage_model.lower() == 'pcm'
    from Fred2.CleavagePrediction import PCM
    pcm = PCM()

    scored_pairs = []
    for left, right in epitopes:
        predictions = pcm.predict(Peptide(left + right))
        junction = len(left) - 1
        reach = int((window_size - 1) / 2)

        total = 0.0
        for position, (_, likelihood) in enumerate(predictions.values):
            if abs(position - junction) > reach:
                continue
            if position == junction:
                total += -1 * likelihood
            else:
                total += penalty * likelihood
        scored_pairs.append((left, right, total))

    return scored_pairs
def append_score(dt2, alleles):
    """
    Given a chopped sequence table (output from sliding_window()), append
    the immunogenicity scores.

    Every distinct sequence of the ``MT`` and ``WT`` columns is predicted
    once. The predictions are first joined on ``WT`` (score column renamed to
    ``WT_score``) and then on ``MT`` together with ``method`` and ``allele``
    (score column renamed to ``MT_score``).
    """
    unique_sequences = set(list(dt2["MT"]) + list(dt2["WT"]))
    peptides_to_compute = [Peptide(seq) for seq in unique_sequences]

    res = fred2wrap.predict_peptide_effects(peptides_to_compute, alleles)
    # Replace the peptide objects by plain strings so they can serve as join keys.
    res["peptide"] = list(map(str, res["peptide"]))

    with_wt = pd.merge(dt2, res, how='left', left_on="WT",
                       right_on="peptide").rename(columns={'score': 'WT_score'})
    del with_wt["peptide"]

    prediction_keys = ["method", "allele"]
    full = pd.merge(with_wt,
                    res,
                    how='left',
                    left_on=["MT"] + prediction_keys,
                    right_on=["peptide"] + prediction_keys)
    full = full.rename(columns={'score': 'MT_score'})
    del full["peptide"]
    return full
def generate_epitope_result(input, allele_file):
    """
    Generates an EpitopePredictionResult from the output of epitopeprediction
    and neoepitopeprediction.

    :param input: Tab-separated prediction table with the columns ``Sequence``
                  and ``Method``, optionally ``Antigen ID`` and ``Variant``;
                  every other column is treated as an HLA allele column
    :param allele_file: Tab-separated file with one ``allele<TAB>frequency``
                        pair per line; blank lines are ignored
    :return: tuple of (result indexed by ``(Seq, Method)``, method name taken
             from the first row of the table)

    Exits the process with status -1 if none of the alleles in ``allele_file``
    appears as a column of the prediction table.
    """
    #first generate alleles in allele file
    alleles = {}
    with open(allele_file, "r") as af:
        for line in af:
            # A blank (e.g. trailing) line used to crash the unpacking below.
            if not line.strip():
                continue
            allele, freq = line.split("\t")
            alleles[allele] = Allele(allele, prob=float(freq))

    r_raw = pandas.read_csv(input, sep="\t")
    res_dic = {}
    method = r_raw.loc[0, "Method"]
    columns = {"Sequence", "Method", "Antigen ID", "Variant"}
    alleles_raw = [c for c in r_raw.columns if c not in columns]

    for _, row in r_raw.iterrows():
        seq = row["Sequence"]
        try:
            protPos = {
                Protein(p, gene_id=p, transcript_id=p): [0]
                for p in str(row["Antigen ID"]).split(",")
            }
        except KeyError:
            # No "Antigen ID" column: the peptide carries no protein origin.
            protPos = collections.defaultdict(list)
        pep = Peptide(seq, protein_pos=protPos)

        for a in alleles_raw:
            if a in alleles:
                res_dic.setdefault(alleles[a], {})[pep] = float(row[a])

    if not res_dic:
        sys.stderr.write("HLA alleles of population and HLA used for prediction did not overlap.")
        sys.exit(-1)

    df_result = EpitopePredictionResult.from_dict(res_dic)
    df_result.index = pandas.MultiIndex.from_tuples(
        [(i, method) for i in df_result.index], names=['Seq', 'Method'])
    return df_result, method
def setUp(self):
    """Build a bare peptide, a peptide linked to a protein, and one linked to a variant protein."""
    self.simple = Peptide("SYFPEITHI")

    # Protein sequence and (empty) transcript, both registered as GLUC_HUMAN.
    self.gcg_ps = "MKSIYFVAGLFVMLVQGSWQRSLQDTEEKSRSFSASQADPLSDPDQMNEDKRHSQGTFTSDYSKYLDSRRAQDFVQWLMNTKRNRNNIAKRHDEFERHAEGTFTSDVSSYLEGQAAKEFIAWLVKGRGRRDFPEEVAIVEELGRRHADGSFSDEMNTILDNLAARDFINWLIQTKITDRK"
    self.gcg_t1 = Transcript("", transcript_id="GLUC_HUMAN")
    self.gcg_p1 = Protein(self.gcg_ps,
                          transcript_id='GLUC_HUMAN',
                          orig_transcript=self.gcg_t1)
    self.w_p = Peptide("PROTEIN", {self.gcg_p1: [0]})

    coding = {
        "GLUC_HUMAN":
        MutationSyntax("GLUC_HUMAN", 344, 115, "c.344C>A", "p.A115D")
    }
    self.gcg_v1 = Variant("rs5650", VariationType.SNP, 2, 162145588, 'G',
                          'T', coding, False, False)

    # The variant goes onto a deep copy so that self.gcg_p1 stays variant-free.
    variant_protein = copy.deepcopy(self.gcg_p1)
    variant_protein.vars = {0: [self.gcg_v1]}
    self.w_v = Peptide("VARIANT", {variant_protein: [0]})
def popcover(input_peptides, input_affinities, input_alleles, output_vaccine,
             processes, epitopes):
    """
    Select a vaccine with PopCover and write it to ``output_vaccine`` as CSV.

    The output has the columns ``cocktail``, ``index`` and ``epitope``; the
    cocktail column is always 0. A non-positive ``processes`` is added to the
    CPU count to obtain the number of worker processes.
    """
    peptides = {}
    with open(input_peptides) as f:
        for row in csv.DictReader(f):
            peptide = Peptide(row['peptide'])
            peptides[peptide] = set(row['proteins'].split(';'))
    LOGGER.info('Loaded %d peptides', len(peptides))

    allele_data = utilities.get_alleles_and_thresholds(input_alleles).to_dict(
        'index')
    thresholds = {}
    for allele, data in allele_data.items():
        thresholds[allele.replace('HLA-', '')] = data['threshold']
    LOGGER.info('Loaded %d alleles', len(thresholds))

    affinities = utilities.affinities_from_csv(input_affinities, allele_data,
                                               peptides)
    LOGGER.info('Loaded %d affinities', len(affinities))

    LOGGER.info("Creating vaccine...")
    if processes > 0:
        worker_count = processes
    else:
        worker_count = mp.cpu_count() + processes
    model = PopCover(affinities,
                     thresholds,
                     k=epitopes,
                     processes=worker_count)
    vaccine = model.solve()

    with open(output_vaccine, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('cocktail', 'index', 'epitope'))
        for position, epitope in enumerate(vaccine):
            writer.writerow((0, position, epitope))
            LOGGER.info('    %s', epitope)
def read_epitope_input(args, alleles, exclude):
    """
    reads in epitope files generated by NGSAnalyzer+ImmunogenicityPredictor

    Header NGSAnalyzer:
    mutation                      - position of the mutation in the reference genome (currently hg19);
                                    format: chromosome_position; the position is zero-based
    gene                          - gene affected by the mutation
    transcript                    - transcript affected by the mutation (UCSC known genes transcript ID)
    transcript_expression         - expression in RPKM/FPKM of the affected transcript
    neopeptide                    - peptide resulting from the mutation in the given transcript
    length_of_neopeptide          - length of the neopeptide
    HLA                           - HLA used for the binding prediction of the neopeptide
    HLA_class1_binding_prediction - predicted binding affinity (currently rank score of IEDB consensus tool)

    Header ImmunogenicityPredictor:
    immunogenicity - predicted immunogenicity for specific HLA allele in column
    distance       - distance-to-self estimation specific for HLA allele in column
    [uncertainty]  - If immunopredictor can estimate prediction uncertainty

    Rows with an empty or excluded neopeptide are skipped silently; rows whose
    HLA is not among ``alleles`` are skipped with a warning.

    :param args: Input arguments (reads ``input``, ``rank``, ``immunogenicity``,
                 ``distance``, ``uncertainty`` and ``taa``)
    :param alleles: HLA alleles
    :param exclude: excluded peptides (``None`` disables the filter)
    :return: df_epitope - EpitopePredictionResult indexed by (Seq, Method) with method "custom"
             distance - dict((pep_string, allele_name), float)
             expression - dict(gene_id, float), the maximum over all rows of the gene
             uncertainty - dict((pep_string, allele_name), float)
             pep_to_mutation - dict(pep_string, list of mutation_string)
    """
    distance = {}
    uncertainty = {}
    expression = {}
    seq_to_pep = {}
    gene_to_prot = {}
    hla_to_allele = {a.name: a for a in alleles}
    df_pred = {a: {} for a in alleles}
    pep_to_mutation = {}

    # NOTE(review): mode "rU" is a Python 2 idiom and is rejected by
    # Python >= 3.11 - confirm the target interpreter before changing it.
    with open(args.input, "rU") as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            seq = row["neopeptide"]
            if exclude is not None and seq in exclude:
                continue
            if len(seq) == 0:
                continue

            # One Peptide object per distinct sequence.
            if seq in seq_to_pep:
                pep = seq_to_pep[seq]
            else:
                pep = Peptide(seq.upper())
                seq_to_pep[seq] = pep

            try:
                allele = hla_to_allele[row["HLA"].replace("HLA-", "")]
            except (KeyError, AttributeError):
                # KeyError: unknown allele or missing HLA column;
                # AttributeError: short row where DictReader filled in None.
                logging.warning(
                    "HLA {allele} was not contained in the provided allele file. Please check your input."
                    .format(allele=row["HLA"]))
                continue

            # One placeholder Protein per gene; the gene id doubles as transcript id.
            gene = row["gene"]
            if gene in gene_to_prot:
                prot = gene_to_prot[gene]
            else:
                prot = Protein("", gene_id=gene, transcript_id=gene)
                gene_to_prot[gene] = prot

            pep.proteins[prot.transcript_id] = prot
            pep.proteinPos[prot.transcript_id].append(0)
            pep_to_mutation.setdefault(seq, []).append(row["mutation"])
            expression.setdefault(gene, []).append(
                float(row["transcript_expression"]))

            raw_score = float(row[args.immunogenicity])
            if args.rank:
                # Rank in percent -> score in [0, 1], higher is better.
                df_pred[allele][pep] = max(0., 1. - raw_score / 100.0)
            elif args.immunogenicity == "HLA_class1_binding_prediction":
                # Log-transform with base 50000, clipped at zero.
                df_pred[allele][pep] = max(
                    0., 1. - math.log(raw_score, 50000))
            else:
                df_pred[allele][pep] = raw_score

            if args.distance is not None:
                distance[(seq, allele.name)] = float(row[args.distance])

            if args.uncertainty is not None:
                uncertainty[(seq, allele.name)] = float(row[args.uncertainty])

            if args.taa is not None:
                pep.log_metadata("taa", row[args.taa].upper() == "TAA")

    # items() instead of iteritems(): works on Python 2 and Python 3 alike.
    expression = {k: max(v) for k, v in expression.items()}
    df_result = EpitopePredictionResult.from_dict(df_pred)
    df_result.index = pandas.MultiIndex.from_tuples(
        [(i, "custom") for i in df_result.index], names=['Seq', 'Method'])

    return df_result, distance, expression, uncertainty, pep_to_mutation
def toplevel_predictor(x):
    """
    Predict epitopes for the raw sequences in ``x`` with netMHC 3.4.

    :param x: iterable of peptide sequences
    :return: whatever the predictor's ``predict`` returns for those peptides
    """
    netmhc = EpitopePredictorFactory("netMHC", version="3.4")
    return netmhc.predict(list(map(Peptide, x)))
def setUp(self):
    """Provide the three peptides shared by the tests of this case."""
    sequences = ("KLLPRLPGV", "YLYDHLAPM", "ALYDVVSTL")
    self.peptides = [Peptide(seq) for seq in sequences]
def solve(self, start=0, threads=None, options=None):
    """
    Solve the epitope assembly problem with spacers optimally using integer linear programming.

    .. note::

        This can take quite long and should not be done for more than 30 epitopes max!
        Also, one has to disable pre-solving steps in order to use this model.

    The instance's stored thresholds are left untouched: the transformation
    and allele pruning needed for this run are applied to a private copy, so
    the method can be called repeatedly.

    :param int start: Start length for spacers (default 0).
    :param int threads: Number of threads used for spacer design.
                        Be careful, if options contain solver threads it will allocate threads*solver_threads cores!
    :param dict(str,str) options: Solver specific options as keys and parameters as values
    :return: A list of ordered :class:`~Fred2.Core.Peptide.Peptide`
    :rtype: list(:class:`~Fred2.Core.Peptide.Peptide`)
    :raises ValueError: If no epitope matrix could be loaded for any allele
    """

    def _load_model(name, model):
        # Import the PSSM module and return the matrix object named like the module.
        return getattr(
            __import__("Fred2.Data.pssms." + name + ".mat." + model,
                       fromlist=[model]), model)

    options = dict() if options is None else options
    threads = mp.cpu_count() if threads is None else threads

    #prepare parameters
    cn = min(self.__clev_pred.supportedLength)
    cl_pssm = _load_model(self.__clev_pred.name,
                          self.__clev_pred.name + "_" + str(cn))
    cleav_pos = self.__clev_pred.cleavagePos
    en = self.__en
    epi_pssms = {}
    allele_prob = {}
    delete_alleles = []
    log_scaled = self.__epi_pred.name in ["smm", "smmpmbec", "comblibsidney"]

    # Work on a copy: transforming/pruning self.__thresh in place would
    # double-transform (or KeyError on) a second call.
    thresh = dict(self.__thresh)
    if log_scaled:
        thresh = {
            k: (1 - math.log(v, 50000) if v != 0 else 0)
            for k, v in thresh.items()
        }

    for a in self.__alleles:
        allele_prob[a.name] = a.prob
        try:
            pssm = _load_model(
                self.__epi_pred.name, "%s_%i" %
                (self.__epi_pred.convert_alleles([a])[0], en))
        except ImportError:
            # No matrix shipped for this allele/length combination.
            delete_alleles.append(a)
            continue

        for j, v in pssm.items():
            for aa, score in v.items():
                if log_scaled:
                    epi_pssms[j, aa, a.name] = 1 / 10. - math.log(
                        math.pow(10, score), 50000)
                else:
                    epi_pssms[j, aa, a.name] = score

    #delete alleles from model that generated an error while loading matrices
    for a in delete_alleles:
        del allele_prob[a.name]
        del thresh[a.name]

    if not epi_pssms:
        raise ValueError(
            "Selected alleles with epitope length are not supported by the prediction method."
        )

    #run spacer designs in parallel using multiprocessing
    tasks = ((str(ei), str(ej), i, en, cn, cl_pssm, epi_pssms, cleav_pos,
              allele_prob, self.__alpha, thresh, self.__solver, self.__beta,
              options) for i in range(start, self.__k + 1)
             for ei, ej in itr.product(self.__peptides, repeat=2)
             if ei != ej)
    # The pool is created only once everything is prepared and is always
    # shut down, so a failure cannot leave worker processes behind.
    pool = mp.Pool(threads)
    try:
        res = pool.map(_runs_lexmin, tasks)
    finally:
        pool.close()
        pool.join()

    #find best scoring spacer for each epitope pair
    opt_spacer = {}
    adj_matrix = {}
    inf = float("inf")
    for ei, ej, _score, _epi, spacer, c1, c2, _non_c in res:
        cost = -min(c1, c2)
        if adj_matrix.get((ei, ej), inf) > cost:
            adj_matrix[(ei, ej)] = cost
            opt_spacer[(ei, ej)] = spacer

    self.spacer = opt_spacer

    #solve assembly with generated adjacency matrix
    assembler = EpitopeAssembly(self.__peptides,
                                self.__clev_pred,
                                solver=self.__solver,
                                matrix=adj_matrix)
    res = assembler.solve(options=options)

    #generate output: epitope, spacer, epitope, spacer, ...
    sob = []
    for i in range(len(res) - 1):
        ei = str(res[i])
        ej = str(res[i + 1])
        if not i:
            sob.append(Peptide(ei))
        sob.append(Peptide(opt_spacer[ei, ej]))
        sob.append(Peptide(ej))
    return sob
def __init__(self,
             peptides,
             pred,
             solver="glpk",
             weight=0.0,
             matrix=None,
             verbosity=0):
    """
    Build the ILP (a tour over all peptides plus a dummy node) for the assembly.

    :param peptides: List of peptides to assemble
    :param pred: Cleavage site predictor; must be an ACleavageSitePrediction
    :param str solver: Name handed to SolverFactory (default "glpk")
    :param float weight: Weight of the cleavage predicted next to the junction
                         relative to the cleavage at the junction itself
    :param matrix: Optional precomputed edge weights keyed by
                   ``(sequence, sequence)``; if given, no cleavage prediction
                   is run. NOTE: the passed mapping is extended in place with
                   zero-weight edges to and from the dummy node.
    :param int verbosity: Values > 0 print the generated model
    :raises ValueError: If ``pred`` is not an ACleavageSitePrediction
    """
    if not isinstance(pred, ACleavageSitePrediction):
        raise ValueError(
            "Cleave site predictor must be of type ACleavageSitePrediction")

    if len(peptides) > 60:
        warnings.warn(
            "The peptide set exceeds 60. Above this level one has to expect " +
            "considerably long running times due to the complexity of the problem."
        )

    #Generate model
    #1. Generate peptides for which cleave sites have to be predicted
    #2. generate graph with dummy element
    self.__verbosity = verbosity

    pep_tmp = peptides[:]
    pep_tmp.append("Dummy")

    edge_matrix = {}
    fragments = {}
    seq_to_pep = {}
    self.neo_cleavage = {}
    self.good_cleavage = {}

    if matrix is None:
        for start, stop in itr.combinations(pep_tmp, 2):
            if start == "Dummy" or stop == "Dummy":
                # Edges touching the dummy node are free in both directions.
                seq_to_pep[str(start)] = start
                seq_to_pep[str(stop)] = stop
                edge_matrix[(str(start), str(stop))] = 0
                edge_matrix[(str(stop), str(start))] = 0
            else:
                # Both concatenation orders need a cleavage prediction.
                start_str = str(start)
                stop_str = str(stop)
                frag = Peptide(start_str + stop_str)
                garf = Peptide(stop_str + start_str)

                fragments[frag] = (start_str, stop_str)
                fragments[garf] = (stop_str, start_str)

        cleave_pred = pred.predict(list(fragments.keys()))
        #cleave_site_df = cleave_pred.xs((slice(None), (cleavage_pos-1)))
        for i in set(cleave_pred.index.get_level_values(0)):
            # .loc replaces the removed DataFrame.ix accessor; `i` is an
            # index label, so the lookup is unchanged.
            fragment = "".join(cleave_pred.loc[i]["Seq"])
            start, stop = fragments[fragment]

            cleav_pos = len(str(start)) - 1
            # Cleavage at the junction (last residue of the first peptide)...
            junction = cleave_pred.loc[(i, cleav_pos), pred.name]
            # ...and at the positions around it (one before, three after).
            neighbourhood = sum(cleave_pred.loc[(i, j), pred.name]
                                for j in range(cleav_pos - 1, cleav_pos + 4, 1)
                                if j != cleav_pos)

            edge_matrix[(start, stop)] = -1.0 * (junction -
                                                 weight * neighbourhood)
            self.neo_cleavage[(start, stop)] = neighbourhood
            self.good_cleavage[(start, stop)] = junction
    else:
        edge_matrix = matrix
        seq_to_pep = {str(p): p for p in pep_tmp}
        for p in seq_to_pep.keys():
            if p != "Dummy":
                edge_matrix[(p, "Dummy")] = 0
                edge_matrix[("Dummy", p)] = 0
    self.__seq_to_pep = seq_to_pep

    #3. initialize ILP
    self.__solver = SolverFactory(solver)
    model = ConcreteModel()

    E = [x for x in list(seq_to_pep.keys()) if x != "Dummy"]
    model.E = Set(initialize=E)
    model.E_prime = Set(initialize=list(seq_to_pep.keys()))
    model.ExE = Set(initialize=itr.permutations(E, 2), dimen=2)

    model.w_ab = Param(model.E_prime, model.E_prime, initialize=edge_matrix)
    model.card = Param(initialize=len(model.E_prime))

    # x[a, b] = 1 if b directly follows a; u[a] orders the nodes of the tour.
    model.x = Var(model.E_prime, model.E_prime, within=Binary)
    model.u = Var(model.E, domain=PositiveIntegers, bounds=(2, model.card))

    model.obj = Objective(
        rule=lambda model: sum(model.w_ab[a, b] * model.x[a, b]
                               for a in model.E_prime
                               for b in model.E_prime if a != b),
        sense=minimize)

    # Every node has exactly one successor and exactly one predecessor.
    model.tour_constraint_1 = Constraint(
        model.E_prime,
        rule=lambda model, a: sum(model.x[a, b] for b in model.E_prime
                                  if a != b) == 1)
    model.tour_constraint_2 = Constraint(
        model.E_prime,
        rule=lambda model, a: sum(model.x[b, a] for b in model.E_prime
                                  if a != b) == 1)
    # Ordering constraint on u that rules out sub-tours among the real peptides.
    model.cardinality_constraint = Constraint(
        model.ExE,
        rule=lambda model, a, b: model.u[a] - model.u[b] + 1 <=
        (model.card - 1) * (1 - model.x[a, b]))

    self.instance = model
    if self.__verbosity > 0:
        print("MODEL INSTANCE")
        self.instance.pprint()
def approximate(self, start=0, threads=1, options=None):
    """
    Approximates the Epitope Assembly problem by applying the Lin-Kernighan traveling salesman heuristic

    LKH implementation must be downloaded, compiled, and globally executable.

    Source code can be found here:
    http://www.akira.ruc.dk/~keld/research/LKH/

    The instance's stored thresholds are left untouched: the transformation
    and allele pruning needed for this run are applied to a private copy, so
    the method can be called repeatedly.

    :param int start: Start length for spacers (default 0).
    :param int threads: Number of threads used for spacer design.
                        Be careful, if options contain solver threads it will allocate threads*solver_threads cores!
    :param dict(str,str) options: Solver specific options (threads for example)
    :return: A list of ordered :class:`~Fred2.Core.Peptide.Peptide`
    :rtype: list(:class:`~Fred2.Core.Peptide.Peptide`)
    :raises ValueError: If no epitope matrix could be loaded for any allele
    """

    def _load_model(name, model):
        # Import the PSSM module and return the matrix object named like the module.
        return getattr(
            __import__("Fred2.Data.pssms." + name + ".mat." + model,
                       fromlist=[model]), model)

    options = dict() if options is None else options
    threads = mp.cpu_count() if threads is None else threads

    # prepare parameters
    cn = min(self.__clev_pred.supportedLength)
    cl_pssm = _load_model(self.__clev_pred.name,
                          self.__clev_pred.name + "_" + str(cn))
    cleav_pos = self.__clev_pred.cleavagePos
    en = self.__en
    epi_pssms = {}
    allele_prob = {}
    delete_alleles = []
    log_scaled = self.__epi_pred.name in ["smm", "smmpmbec", "comblibsidney"]

    # Work on a copy: transforming/pruning self.__thresh in place would
    # double-transform (or KeyError on) a second call.
    thresh = dict(self.__thresh)
    if log_scaled:
        thresh = {
            k: (1 - math.log(v, 50000) if v != 0 else 0)
            for k, v in thresh.items()
        }

    for a in self.__alleles:
        allele_prob[a.name] = a.prob
        try:
            pssm = _load_model(
                self.__epi_pred.name, "%s_%i" %
                (self.__epi_pred.convert_alleles([a])[0], en))
        except ImportError:
            # No matrix shipped for this allele/length combination.
            delete_alleles.append(a)
            continue

        for j, v in pssm.items():
            for aa, score in v.items():
                if log_scaled:
                    epi_pssms[j, aa, a.name] = 1 / 10. - math.log(
                        math.pow(10, score), 50000)
                else:
                    epi_pssms[j, aa, a.name] = score

    # delete alleles from model that generated an error while loading matrices
    for a in delete_alleles:
        del allele_prob[a.name]
        del thresh[a.name]

    if not epi_pssms:
        raise ValueError(
            "Selected alleles with epitope length are not supported by the prediction method."
        )

    # run spacer designs in parallel using multiprocessing
    tasks = ((str(ei), str(ej), i, en, cn, cl_pssm, epi_pssms, cleav_pos,
              allele_prob, self.__alpha, thresh, self.__solver, self.__beta,
              options) for i in range(start, self.__k + 1)
             for ei, ej in itr.product(self.__peptides, repeat=2)
             if ei != ej)
    # The pool is created only once everything is prepared and is always
    # shut down, so a failure cannot leave worker processes behind.
    pool = mp.Pool(threads)
    try:
        res = pool.map(_runs_lexmin, tasks)
    finally:
        pool.close()
        pool.join()

    # find best scoring spacer for each epitope pair
    opt_spacer = {}
    adj_matrix = {}
    inf = float("inf")
    for ei, ej, _score, _epi, spacer, c1, c2, _non_c in res:
        cost = -min(c1, c2)
        if adj_matrix.get((ei, ej), inf) > cost:
            adj_matrix[(ei, ej)] = cost
            opt_spacer[(ei, ej)] = spacer

    self.spacer = opt_spacer

    # solve assembly with generated adjacency matrix
    assembler = EpitopeAssembly(self.__peptides,
                                self.__clev_pred,
                                solver=self.__solver,
                                matrix=adj_matrix)
    res = assembler.approximate()

    # generate output: epitope, spacer, epitope, spacer, ...
    sob = []
    for i in range(len(res) - 1):
        ei = str(res[i])
        ej = str(res[i + 1])
        if not i:
            sob.append(Peptide(ei))
        sob.append(Peptide(opt_spacer[ei, ej]))
        sob.append(Peptide(ej))
    return sob
import pandas as pd from docopt import docopt if __name__ == "__main__": arguments = docopt(__doc__) file_in = arguments["--input"] if not file_in: file_in = "./data/binders.csv" file_out = arguments["--output"] dt = pd.read_csv(file_in) dt = dt[dt["Sequence"].notnull()] dt = dt[dt["Sequence"].str.len() == 9] peptides = [Peptide(peptide) for peptide in dt["Sequence"]] dt["allele"] = dt["allele"].str.replace("\*","").\ str.replace("(-[a-zA-Z]+)([0-9]{2})([0-9]{2})","\\1*\\2:\\3").\ str.replace("w","").\ str.replace("HLA-","") dt.rename(columns={"Sequence": "peptide"}, inplace=True) alleles = [Allele(allele) for allele in dt["allele"].unique().tolist()] res = fred2wrap.predict_peptide_effects( peptides, alleles=dt["allele"].unique().tolist()) res["peptide"] = [peptide.tostring() for peptide in res["peptide"]] res["allele"] = [str(allele) for allele in res["allele"]] res = res.pivot_table(index=["peptide", "allele"], columns='method',
def main():
    """Command-line entry point: design a string-of-beads vaccine with spacers.

    Parses the command line, reads the epitopes and the HLA alleles, builds an
    ``EpitopeAssemblyWithSpacer`` model and runs ``approximate()`` or
    ``solve()`` depending on ``--tsp-solution``. With ``--random-order`` the
    epitopes are shuffled (seeded by ``--seed``) and joined with the spacers
    looked up in ``solver.spacer``. The resulting beads are printed and
    written, joined by ``-``, to the output file.

    Exits with status -1 when an unsupported cleavage or epitope prediction
    method is requested.
    """
    parser = argparse.ArgumentParser(
        description=(
            "The software is a novel approach to construct epitope-based "
            "string-of-beads vaccines in optimal order and with "
            "sequence-optimized spacers of flexible length such that the "
            "recovery of contained epitopes is maximized and immunogenicity "
            "of arising neo-epitopes is reduced. "
        )
    )
    parser.add_argument(
        "-i", "--input", required=True,
        help="File containing epitopes (one peptide per line)")
    parser.add_argument(
        "-a", "--alleles", required=True,
        help="Specifies file containing HLA alleles with corresponding "
             "HLA probabilities (one HLA per line)")

    # parameters of the model
    parser.add_argument(
        "-k", "--max_length", default=6, type=int,
        help="Specifies the max. length of the spacers (default 6)")
    parser.add_argument(
        "-al", "--alpha", default=0.99, type=float,
        help="Specifies the first-order preference of the user in the "
             "model [0,1] (default 0.99)")
    parser.add_argument(
        "-be", "--beta", default=0.0, type=float,
        help="Specifies the second-order preference of the user in the "
             "model [0,1] (default 0).")
    parser.add_argument(
        "-cp", "--cleavage_prediction", default="PCM",
        help="Specifies the used cleavage prediction method (default PCM) "
             "[available: PCM, PROTEASMM_C, PROTEASMM_S]")
    parser.add_argument(
        "-ep", "--epitope_prediction", default="Syfpeithi",
        help="Specifies the used epitope prediction method (default "
             "Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]")
    parser.add_argument(
        "-thr", "--threshold", default=20, type=float,
        help="Specifies epitope prediction threshold for SYFPEITHI "
             "(default 20).")
    parser.add_argument(
        "-o", "--output", required=True,
        help="Specifies the output file.")
    parser.add_argument(
        "-t", "--threads", type=int, default=None,
        help="Specifies number of threads. If not specified all available "
             "logical cpus are used.")
    parser.add_argument(
        "--ips-solver", default="cplex", choices=["cplex", "cbc"],
        help="Executable name of the IPS solver. Executable needs to be "
             "available in PATH.")
    parser.add_argument(
        "--tsp-solution", default="approximate",
        choices=["approximate", "optimal"],
        help="Type of solution of the TSP")
    parser.add_argument(
        "--random-order", action="store_true",
        help="Indicate whether to generate a random ordered "
             "string-of-beads polypeptide")
    parser.add_argument(
        "--seed", type=int, default=1,
        help="Seed for random ordering of string-of-beads polypeptide")
    args = parser.parse_args()

    # parse input
    peptides = list(FileReader.read_lines(args.input, in_type=Peptide))

    # read in alleles
    alleles = generate_alleles(args.alleles)

    # NOTE: single-argument print(...) behaves identically under the
    # Python 2 print statement and the Python 3 print function.
    cleavage_methods = ("PCM", "PROTEASMM_C", "PROTEASMM_S")
    if args.cleavage_prediction.upper() not in cleavage_methods:
        print("Specified cleavage predictor is currently not supported. "
              "Please choose either PCM, PROTEASMM_C, or PROTEASMM_S")
        sys.exit(-1)

    epitope_methods = ("SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC")
    if args.epitope_prediction.upper() not in epitope_methods:
        # This message used to blame the cleavage predictor by mistake.
        print("Specified epitope predictor is currently not supported. "
              "Please choose either Syfpeithi, BIMAS, SMM, SMMPMBEC")
        sys.exit(-1)

    # set-up model
    cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction)
    epi_pred = EpitopePredictorFactory(args.epitope_prediction)

    # the same threshold is applied to every allele
    thr = {a.name: args.threshold for a in alleles}

    solver = EpitopeAssemblyWithSpacer(peptides,
                                       cl_pred,
                                       epi_pred,
                                       alleles,
                                       k=args.max_length,
                                       en=9,
                                       threshold=thr,
                                       solver=args.ips_solver,
                                       alpha=args.alpha,
                                       beta=args.beta,
                                       verbosity=0)

    # solve
    # pre-processing has to be disabled, otherwise many solvers will destroy
    # the symmetry of the problem. How to do this depends on the solver used.
    # For CPLEX it is preprocessing_presolve=n
    threads = mp.cpu_count() if args.threads is None else args.threads
    solver_options = {"preprocessing_presolve": "n", "threads": 1}
    if args.tsp_solution == "approximate":
        svbws = solver.approximate(threads=threads, options=solver_options)
    else:
        svbws = solver.solve(threads=threads, options=solver_options)

    # Generate a randomly ordered string-of-beads, but still use the spacers
    # determined by the solver run above.
    if args.random_order:
        print("Generating a randomly ordered polypeptide")
        random.seed(args.seed)
        random.shuffle(peptides)

        shuffled = [str(p) for p in peptides]
        random_order_sob = []
        # Emit "epitope, spacer" for every adjacent pair; the right-hand
        # epitope of a pair is emitted as the left-hand one of the next pair.
        for left_peptide, right_peptide in zip(shuffled, shuffled[1:]):
            opt_spacer = solver.spacer[(left_peptide, right_peptide)]
            random_order_sob.extend(
                [Peptide(left_peptide), Peptide(opt_spacer)])
        # The last epitope has no right neighbour and closes the chain.
        if shuffled:
            random_order_sob.append(Peptide(shuffled[-1]))
        svbws = random_order_sob

    beads = "-".join(map(str, svbws))
    print("")
    # Two spaces after the colon reproduce the output of the former
    # two-item print statement.
    print("Resulting String-of-Beads:  " + beads)
    print("")
    with open(args.output, "w") as f:
        f.write(beads)
if __name__ == "__main__": arguments = docopt(__doc__) file_in = arguments["--input"] if not file_in: file_in = os.path.expanduser("data/immunogenic_SNVs-training_sets.csv") file_out = arguments["--output"] if not file_out: file_out = os.path.expanduser("data/immunogenic_SNVs-model_data.csv") dt = pd.read_csv(file_in) dt = dt[dt["mutant_sequence"].notnull() & dt["wt_sequence"].notnull()] # dt = dt[dt["Sequence"].str.len() == 9] all_peptides = dt["mutant_sequence"].append(dt["wt_sequence"]).unique() peptides = [Peptide(peptide) for peptide in all_peptides] dt["allele"] = dt["allele"].str.replace("\*", "").\ str.replace(":", "").\ str.replace("(-[a-zA-Z]+)([0-9]{2})([0-9]{2})", "\\1*\\2:\\3").\ str.replace("w", "").\ str.replace("HLA-", "") # TODO # dt.rename(columns = {"Sequence": "peptide"}, inplace = True) alleles = [] valid_alleles = [] for allele in dt["allele"].tolist(): try: a = Allele(allele) valid_alleles.append(True)
parser.add_argument(
    'alleles',
    metavar='allele',
    nargs='+',
    type=str,
    help='HLA alleles to predict against.',
)
args = parser.parse_args()

####################################################################################################
from Fred2.Core import Allele, Peptide
from Fred2.EpitopePrediction import EpitopePredictorFactory
####################################################################################################

# Wrap every raw input line (whitespace stripped) in a Fred2.Core.Peptide
all_peptides = [Peptide(row.strip()) for row in args.peptides]

# Group the peptides into buckets keyed by their length
peptides_by_length = {}
for peptide in all_peptides:
    peptides_by_length.setdefault(len(peptide), []).append(peptide)

# Wrap every raw allele string in a Fred2.Core.Allele
alleles = [Allele(allele) for allele in args.alleles]

# Instantiate the predictor
predictor = EpitopePredictorFactory("Syfpeithi")
def read_epitope_input(args, alleles, exclude):
    """
    Reads in epitope files generated by NGSAnalyzer+ImmunogenicityPredictor.

    The file at ``args.input`` is parsed as tab-separated text with a header.

    Header NGSAnalyzer:
    mutation - position of the mutation in the reference genome (currently hg19);
               format: chromosome_position; the position is zero-based
    gene - gene affected by the mutation
    transcript - transcript affected by the mutation (UCSC known genes transcript ID)
    transcript_expression - expression in RPKM/FPKM of the affected transcript
    neopeptide - peptide resulting from the mutation in the given transcript
    length_of_neopeptide - length of the neopeptide
    HLA - HLA used for the binding prediction of the neopeptide
    HLA_class1_binding_prediction - predicted binding affinity
               (currently rank score of IEDB consensus tool)

    Header ImmunogenicityPredictor:
    immunogenicity - predicted immunogenicity for specific HLA allele in column
    distance - distance-to-self estimation specific for HLA allele in column
    [uncertainty] - If immunopredictor can estimate prediction uncertainty

    Rows without a peptide, rows whose peptide is listed in ``exclude`` and
    rows whose HLA is not among ``alleles`` (a warning is logged) are skipped.

    :param args: Input arguments; read here are ``input``, ``rank``,
                 ``immunogenicity``, ``distance``, ``uncertainty`` and ``taa``
    :param alleles: HLA alleles (objects with a ``name`` attribute)
    :param exclude: excluded peptide sequences, or None to exclude nothing
    :return: df_epitope - EpitopePredictionResult
             distance - dict((pep_string, allele_name), float)
             expression - dict(gene_id, float), max expression seen per gene
             uncertainty - dict((pep_string, allele_name), float)
             pep_to_mutation - dict(pep_string, list of mutation_string)
    """
    distance = {}
    uncertainty = {}
    expression = {}
    seq_to_pep = {}
    gene_to_prot = {}
    hla_to_allele = {a.name: a for a in alleles}
    df_pred = {a: {} for a in alleles}
    pep_to_mutation = {}

    with open(args.input, "rU") as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            seq = row["neopeptide"]
            # Skip rows without a peptide (empty or missing field) and
            # explicitly excluded peptides.
            if not seq or (exclude is not None and seq in exclude):
                continue

            # One Peptide object per distinct sequence string.
            if seq in seq_to_pep:
                pep = seq_to_pep[seq]
            else:
                pep = Peptide(seq.upper())
                seq_to_pep[seq] = pep

            hla = row["HLA"]
            try:
                allele = hla_to_allele[hla.replace("HLA-", "")]
            except (KeyError, AttributeError):
                # Unknown allele (or no HLA value at all): warn and skip the
                # row, without swallowing unrelated exceptions.
                logging.warning(
                    "HLA {allele} was not contained in the provided allele file. Please check your input.".format(
                        allele=hla))
                continue

            # One Protein placeholder per gene, keyed by the gene id.
            gene = row["gene"]
            if gene in gene_to_prot:
                prot = gene_to_prot[gene]
            else:
                prot = Protein("", gene_id=gene, transcript_id=gene)
                gene_to_prot[gene] = prot

            pep.proteins[prot.transcript_id] = prot
            pep.proteinPos[prot.transcript_id].append(0)
            pep_to_mutation.setdefault(seq, []).append(row["mutation"])
            expression.setdefault(gene, []).append(
                float(row["transcript_expression"]))

            raw_score = float(row[args.immunogenicity])
            if args.rank:
                # rank given in percent -> score in [0, 1], higher is better
                score = max(0., 1. - raw_score / 100.0)
            elif args.immunogenicity == "HLA_class1_binding_prediction":
                # log-transform to base 50000, clipped at 0
                score = max(0., 1. - math.log(raw_score, 50000))
            else:
                score = raw_score
            df_pred[allele][pep] = score

            if args.distance is not None:
                distance[(seq, allele.name)] = float(row[args.distance])

            if args.uncertainty is not None:
                uncertainty[(seq, allele.name)] = float(row[args.uncertainty])

            if args.taa is not None:
                pep.log_metadata("taa", row[args.taa].upper() == "TAA")

    # Keep only the highest expression value observed for each gene.
    expression = {k: max(v) for k, v in expression.items()}

    df_result = EpitopePredictionResult.from_dict(df_pred)
    df_result.index = pandas.MultiIndex.from_tuples(
        [(i, "custom") for i in df_result.index],
        names=['Seq', 'Method'])

    return df_result, distance, expression, uncertainty, pep_to_mutation
def setUp(self):
    """Build the fixtures: a peptide/protein sequence list and an empty transcript."""
    peptide = Peptide("SYFPEISYFP")
    protein = Protein(
        "IHTIEPFYSIHTIEPFYSIHTIEPFYSIHTIEPFYSIHTIEPFYS",
        transcript_id="ID-01",
        gene_id="FOXP3",
    )
    self.seqs = [peptide, protein]
    self.transcript = Transcript("")
""") _imm, _non = set(), set() if args.dataset == 'iedb.tcell': _imm, _non = pepdata.iedb.tcell.load_classes(nrows=args.n) elif args.dataset == 'iedb.mhc': _imm, _non = pepdata.iedb.mhc.load_classes(nrows=args.n) elif args.dataset == 'imma2': _imm, _non = pepdata.imma2.load_classes() elif args.dataset == 'stdin': _imm = [line for line in stdin] else: print("available datasets: iedb.tcell, ideb.mhc, imma2") exit() imm = [Peptide(elem) for elem in _imm] non = [Peptide(elem) for elem in _non] #hla='HLA-A2|HLA-A\*02' #df = () # #if args.dataset == 'iedb.tcell': # df = pepdata.iedb.tcell.load_dataframe(nrows=args.n) #elif args.dataset == 'iedb.mhc': # df = pepdata.iedb.mhc.load_dataframe(nrows=args.n) #elif args.dataset == 'iedb.bcell': # df = pepdata.iedb.bcell.load_dataframe(nrows=args.n) #elif args.dataset == 'stdin': # df = [ line for line in stdin ] #else: # print("available datasets: iedb.tcell, ideb.mhc, imma2")