def setUp(self):
    """Build the shared fixture: three HLA alleles, 24 9-mer peptides,
    their BIMAS predictions and per-allele score thresholds."""
    self.proteins = []
    self.alleles = [
        Allele("HLA-A*01:01"),
        Allele("HLA-B*07:02"),
        Allele("HLA-C*03:01"),
    ]
    raw_sequences = """SFSIFLLAL GHRMAWDMM VYEADDVIL CFTPSPVVV FLLLADARV GPADGMVSK YLYDHLAPM GLRDLAVAV GPTPLLYRL TWVLVGGVL IELGGKPAL LAGGVLAAV QYLAGLSTL NFVSGIQYL VLSDFKTWL ARPDYNPPL KLLPRLPGV RHTPVNSWL GLYLFNWAV ALYDVVSTL RRCRASGVL WPLLLLLLA VTYSLTGLW YFVIFFVAA"""
    self.peptides = [Peptide(seq) for seq in raw_sequences.split()]
    self.result = EpitopePredictorFactory("BIMAS").predict(self.peptides, self.alleles)
    self.thresh = {"A*01:01": 10, "B*07:02": 10, "C*03:01": 10}
def test_single_peptide_input_mhcII(self):
    # Smoke-test every non-external predictor with a single MHC-II peptide.
    for m in EpitopePredictorFactory.available_methods():
        model = EpitopePredictorFactory(m)
        # External predictors need installed binaries; only in-process models run here.
        if not isinstance(model, AExternalEpitopePrediction):
            if all(a.name in model.supportedAlleles for a in self.mhcII):
                # NOTE(review): the guard checks all of self.mhcII but the call uses
                # only self.mhcII[1] with peptide [0] — possibly an [0]/[1] mix-up;
                # confirm against the fixture. Result is not asserted on: this test
                # only verifies that predict() does not raise.
                res = model.predict(self.peptides_mhcII[0], alleles=self.mhcII[1])
def test_single_allele_input(self):
    # Smoke-test every external predictor, in every available version, with a
    # single allele for MHC-II, combined-allele MHC-II, and MHC-I input.
    for m in EpitopePredictorFactory.available_methods():
        for v in EpitopePredictorFactory.available_methods()[m]:
            mo = EpitopePredictorFactory(m, version=v)
            # netmhc 0.1 is explicitly excluded (presumably a known-broken
            # internal/external version pairing — confirm).
            if isinstance(mo, AExternalEpitopePrediction) and not (mo.version == "0.1" and mo.name == "netmhc"):
                print "Testing", mo.name, "version", mo.version
                try:
                    # Run each input class only if at least one allele is supported.
                    if any(a.name in mo.supportedAlleles for a in self.mhcII):
                        mo.predict(self.peptides_mhcII, alleles=self.mhcII[0])
                    if any(a.name in mo.supportedAlleles for a in self.mhcII_combined_alleles):
                        mo.predict(self.peptides_mhcII, alleles=self.mhcII_combined_alleles[0])
                    if any(a.name in mo.supportedAlleles for a in self.mhcI):
                        mo.predict(self.peptides_mhcI, alleles=self.mhcI[0])
                    print "Success"
                except RuntimeError as e:  # catch only those stemming from binary unavailability
                    # A missing binary is an acceptable environment condition;
                    # every other RuntimeError is a real failure. (Py2-only:
                    # e.message does not exist on Python 3.)
                    if "could not be found in PATH" not in e.message:
                        raise e  # all others do not except
            else:
                print mo.name, "not available"
def predict_peptide_effects(peptides, alleles=None):
    """ Predict the peptide effect for all the available methods on the machine

    Args:
        peptides (list of Peptides): Usually an output from read_fasta
        alleles (list of chars): Alleles for which to run the predictors

    Returns:
        pd.DataFrame: Tidy pd.DataFrame. If the method is unable to predict
            for a particular value the rows are not present.

    Raises:
        ValueError: if no predictor produced any result at all (previously
            this surfaced as an opaque IndexError on ``results[0]``).

    Example:
        >>> peptides = [Peptide("SYFPEITHI"), Peptide("FIASNGVKL"), Peptide("LLGATCMFV")]
        >>> alleles = ['A*02:16', 'B*45:01']
        >>> predict_peptide_effects(peptides, alleles = alleles).head()
                                   Seq    Method   allele       score
        0  (F, I, A, S, N, G, V, K, L)       arb  A*02:16  594.691144
        1  (F, I, A, S, N, G, V, K, L)       smm  A*02:16  159.768074
        2  (F, I, A, S, N, G, V, K, L)  smmpmbec  A*02:16  211.977614
        4  (F, I, A, S, N, G, V, K, L)   unitope  A*02:16    0.527849
        5  (L, L, G, A, T, C, M, F, V)       arb  A*02:16    6.784222
    """
    dt = valid_predictors()
    results = []
    for i in range(len(dt)):
        # subset to valid alleles
        if alleles is not None:
            valid_alleles = dt.iloc[i]["supportedAlleles"].intersection(alleles)
            if len(valid_alleles) == 0:
                continue
            valid_alleles = [Allele(al) for al in valid_alleles]
        else:
            valid_alleles = None
        method = dt.iloc[i]["name"]
        print("method: ", method)
        t0 = time.time()
        try:
            results.append(
                EpitopePredictorFactory(method).predict(peptides, alleles=valid_alleles))
        # BUG FIX: was a bare "except:", which also swallowed SystemExit and
        # KeyboardInterrupt; keep the best-effort behaviour but only for real errors.
        except Exception:
            print("Error! Unable to run ", method, ": ", sys.exc_info())
        t1 = time.time()
        print(" - runtime: ", str(t1 - t0))
    # ROBUSTNESS: fail with a clear message instead of IndexError when every
    # predictor errored out or was skipped.
    if not results:
        raise ValueError("No predictor was able to produce results for the given peptides/alleles")
    df = results[0].merge_results(results[1:]).reset_index()
    # melt the per-allele score columns into a tidy (peptide, method, allele, score) frame
    dfm = pd.melt(df, id_vars=["Seq", "Method"], var_name="allele", value_name="score")
    dfm = dfm[dfm["score"].notnull()]
    dfm.rename(columns={'Seq': 'peptide', 'Method': 'method'}, inplace=True)
    return dfm
def test_epitope_conservation_constraint(self): import random self.result = EpitopePredictorFactory("BIMAS").predict(self.peptides, self.alleles) conservation = {} for e in self.result.index.levels[0]: conservation[str(e)] = random.random() pt = OptiTope(self.result, self.thresh, k=3, solver="cbc", verbosity=0) pt.activate_epitope_conservation_const(0.5, conservation=conservation) for e in pt.solve(): print e, conservation[e]
def test_path_and_optional_parameters_netctl(self): netctlpan = EpitopePredictorFactory("NetCTLpan") exe = netctlpan.command.split()[0] for try_path in os.environ["PATH"].split(os.pathsep): try_path = try_path.strip('"') exe_try = os.path.join(try_path, exe).strip() if os.path.isfile(exe_try) and os.access(exe_try, os.X_OK): print netctlpan.predict(self.peptides_mhcI, alleles=self.mhcI, commad=exe_try, options="-wt 0.05 -wc 0.225 -ethr 0.5")
def test_path_option_and_optional_parameters_netmhc(self):
    # Locate the NetMHC binary on PATH; if found, predict with an explicit
    # executable path, the "--sort" option and chunked input, then pin two
    # known scores. If the binary is absent the test passes vacuously.
    netmhc = EpitopePredictorFactory("NetMHC")
    exe = netmhc.command.split()[0]
    for try_path in os.environ["PATH"].split(os.pathsep):
        try_path = try_path.strip('"')
        exe_try = os.path.join(try_path, exe).strip()
        if os.path.isfile(exe_try) and os.access(exe_try, os.X_OK):
            r = netmhc.predict(self.peptides_mhcI, alleles=self.mhcI,
                               command=exe_try, options="--sort", chunksize=1)
            self.assertTrue(len(r) == len(self.peptides_mhcI))
            # Reference scores for NetMHC on A*02:01 (regression values).
            self.assertAlmostEqual(r["A*02:01"]["SYFPEITHI"]["netmhc"], 0.150579105869, places=7, msg=None, delta=None)
            self.assertAlmostEqual(r["A*02:01"]["IHTIEPFYS"]["netmhc"], 0.0619540879359, places=7, msg=None, delta=None)
def test_pareto_front_assembly(self):
    """Assemble the pareto front of the epitope ordering problem and print it."""
    cleavage_predictor = CleavageSitePredictorFactory("PCM")
    epitope_predictor = EpitopePredictorFactory("SMM")
    allele = [Allele("HLA-A*02:01")]
    thresh = {a.name: 10000 for a in allele}
    comp = lambda a, b: a <= b
    assembler = ParetoEpitopeAssembly(
        self.peptides, cleavage_predictor, epitope_predictor,
        allele, thresh, comp, solver="cbc", verbosity=0)
    r = assembler.paretosolve()
    print(r)
    #print assembler.solve(eps=2.0)
def run_predictor(pred, dataset):
    """Run one epitope predictor over *dataset* and report coverage.

    Uses the module-level ``args.allele`` list. Returns a tuple
    (number of result rows, dataset size); the first element is 0 when the
    predictor rejects the input with a ValueError.
    """
    predictor = EpitopePredictorFactory(pred)
    requested_alleles = [Allele(a) for a in args.allele]
    results = ()
    try:
        results = predictor.predict(dataset, alleles=requested_alleles)
        print(results)
        print(results.describe())
    except ValueError:
        # unsupported allele/length combinations: report zero coverage
        pass
    return (len(results), len(dataset))
def test_standart_functions(self): """ Tests default functions needs GLPK installed :return: """ epi_pred = EpitopePredictorFactory("Syfpeithi") cl_pred = CleavageSitePredictorFactory("PCM") sbws = EpitopeAssemblyWithSpacer(self.epis,cl_pred,epi_pred,self.alleles) sol = sbws.solve() print sol assert all(i == str(j) for i,j in zip(["GHRMAWDMM","HH","VYEADDVIL"],sol))
def test_pareto_assembly(self):
    """Solve the pareto assembly at a large epsilon with explicit objective order."""
    cleavage_predictor = CleavageSitePredictorFactory("PCM")
    epitope_predictor = EpitopePredictorFactory("SMM")
    allele = [Allele("HLA-A*02:01")]
    thresh = {a.name: 10000 for a in allele}
    comp = lambda a, b: a <= b
    print(epitope_predictor.predict(self.peptides, alleles=allele))
    #cl_pred, ep_pred, alleles, threshold, comparator, length=9
    assembler = ParetoEpitopeAssembly(
        self.peptides, cleavage_predictor, epitope_predictor,
        allele, thresh, comp, solver="cbc", verbosity=1)
    r = assembler.solve(eps=1e10, order=(1, 0))
    print(r)
def test_allele_cov_constraint(self):
    """Activate the allele-coverage constraint and check the selected epitopes."""
    #self.alleles.extend([Allele("HLA-A*02:01"),Allele("HLA-B*15:01")])
    #self.thresh.update({"A*02:01":0,"B*15:01":0})
    self.result = EpitopePredictorFactory("BIMAS").predict(self.peptides, self.alleles)
    opt = OptiTope(self.result, self.thresh, k=3, solver="cbc", verbosity=0)
    opt.activate_allele_coverage_const(0.99)
    selected = opt.solve()
    # every selected epitope must come from the expected optimal set
    expected = {"GPTPLLYRL", "QYLAGLSTL", "ALYDVVSTL"}
    self.assertTrue({str(p) for p in selected}.issubset(expected))
def test_unsupported_allele_length_combination_exception(self):
    """An allele with no supported length model must make solve() raise ValueError
    (needs GLPK)."""
    epitope_predictor = EpitopePredictorFactory("Syfpeithi")
    cleavage_predictor = CleavageSitePredictorFactory("PCM")
    alleles = [Allele("HLA-A*26:01", prob=0.5)]
    assembly = EpitopeAssemblyWithSpacer(self.epis, cleavage_predictor,
                                         epitope_predictor, alleles,
                                         solver="cbc")
    with self.assertRaises(ValueError):
        assembly.solve()
def predictor_info(method):
    """
    Get all the information about a particular predictor/method from Fred2

    Args:
        method (str): lower-case Fred2 method name (e.g. "syfpeithi").

    Returns:
        dict: name, supported alleles and lengths, version, command and
            PATH-availability of the external binary (None when the probe
            fails or is not applicable), plus a coarse "type" label
            (None for methods missing from the lookup table).
    """
    predictor = EpitopePredictorFactory(method)
    # Both probes are best-effort: purely in-process predictors have no
    # external binary/command.
    # BUG FIX: the bare "except:" clauses also swallowed SystemExit and
    # KeyboardInterrupt; narrowed to Exception.
    try:
        is_in_path = predictor.is_in_path()
    except Exception:
        is_in_path = None
    try:
        command = predictor.command
    except Exception:
        command = None
    # Coarse classification of every known method name.
    method_hash = {
        "syfpeithi": "T-cell epitope",
        "bimas": "MHC-I binding",
        "svmhc": "MHC-I binding",
        "arb": "MHC-I binding",
        "smm": "MHC-I binding",
        "smmpmbec": "MHC-I binding",
        "epidemix": "MHC-I binding",
        "comblib": "MHC-I binding",
        "comblibsidney": "MHC-I binding",
        "pickpocket": "MHC-I binding",
        "netmhc": "MHC-I binding",
        "netmhcpan": "MHC-I binding",
        "hammer": "MHC-II binding",
        "tepitopepan": "MHC-II binding",
        "netmhcii": "MHC-II binding",
        "netmhciipan": "MHC-II binding",
        "unitope": "T-cell epitope",
        "netctlpan": "T-cell epitope",
    }
    retdict = {
        "is_in_path": is_in_path,
        "name": method,
        "supportedAlleles": predictor.supportedAlleles,
        "supportedLength": predictor.supportedLength,
        "command": command,
        "version": predictor.version,
        "type": method_hash.get(method)
    }
    return retdict
def __main__():
    # Write, for each configured prediction method/version, the supported
    # alleles and peptide lengths into "<method>.v<version>.supported_*.txt".
    parser = argparse.ArgumentParser(
        "Write out information about supported models by Fred2 for available prediction tool versions."
    )
    parser.add_argument('-v', '--versions', help='File with used software versions.', required=True)
    args = parser.parse_args()

    # NOTE this needs to be updated manually, if other methods should be used in the future
    available_methods = ['syfpeithi', 'mhcflurry', 'mhcnuggets-class-1', 'mhcnuggets-class-2']

    with open(args.versions, 'r') as versions_file:
        # versions file format: "<tool> : <version>" per line
        tool_version = [(row[0].split()[0], str(row[1])) for row in csv.reader(versions_file, delimiter=":")]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))
        # get for each method the corresponding tool version
        # (matches when the tool name is a substring of the method name)
        methods = {method.strip(): version.strip() for tool, version in tool_version for method in available_methods if tool.lower() in method.lower()}

    for method, version in methods.items():
        # fail early on versions Fred2 does not know about
        if (version not in EpitopePredictorFactory.available_methods()[method]):
            raise ValueError("The specified version " + version + " for " + method + " is not supported by Fred2.")
        predictor = EpitopePredictorFactory(method, version=version)
        # one allele per line, converted back to the external nomenclature
        with open(method + ".v" + str(version) + ".supported_alleles.txt", 'w') as output:
            for a in sorted(predictor.supportedAlleles):
                output.write(convert_allele_back(a) + "\n")
        # one supported peptide length per line
        with open(method + ".v" + str(version) + ".supported_lengths.txt", 'w') as output:
            for l in sorted(predictor.supportedLength):
                output.write(str(l) + "\n")
def test_unsupported_allele_length_combination(self): """ Tests default functions needs GLPK installed :return: """ epi_pred = EpitopePredictorFactory("Syfpeithi") cl_pred = CleavageSitePredictorFactory("PCM") alleles = [ Allele("HLA-A*02:01", prob=0.5), Allele("HLA-A*26:01", prob=0.5) ] sbws = EpitopeAssemblyWithSpacer(self.epis, cl_pred, epi_pred, alleles, solver="cbc") sol = sbws.solve() print sol assert all(i == str(j) for i, j in zip(["GHRMAWDMM", "HH", "VYEADDVIL"], sol))
def test_wrong_internal_to_external_version(self):
    """A mismatched internal/external NetMHC version must raise RuntimeError."""
    with self.assertRaises(RuntimeError):
        predictor = EpitopePredictorFactory("NetMHC", version="0.1")
        predictor.predict(self.peptides_mhcI, alleles=self.mhcI)
def test_epitope_prediction_no_version(self): print EpitopePredictorFactory("BIMAS").predict(self.peptides_mhcI, self.mhcI)
def main(): parser = argparse.ArgumentParser(description="""The software is a novel approach to construct epitope-based string-of-beads vaccines in optimal order and with sequence-optimized spacers of flexible length such that the recovery of contained epitopes is maximized and immunogenicity of arising neo-epitopes is reduced. """) parser.add_argument("-i", "--input", required=True, help="File containing epitopes (one peptide per line)" ) parser.add_argument("-a", "--alleles", required=True, help="Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)" ) #parameters of the model parser.add_argument("-k","--max_length", default=6, type=int, help="Specifies the max. length of the spacers (default 6)") parser.add_argument("-al","--alpha", default=0.99, type=float, help="Specifies the first-order preference of the user in the model [0,1] (default 0.99)") parser.add_argument("-be","--beta", default=0.0, type=float, help="Specifies the second-order preference of the user in the model [0,1] (default 0).") parser.add_argument("-cp","--cleavage_prediction", default="PCM", help="Specifies the used cleavage prediction method (default PCM) [available: PCM, PROTEASMM_C, PROTEASMM_S]" ) parser.add_argument("-ep","--epitope_prediction", default="Syfpeithi", help="Specifies the used epitope prediction method (default Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]" ) parser.add_argument("-thr","--threshold", default=20, type=float, help="Specifies epitope prediction threshold for SYFPEITHI (default 20).") parser.add_argument("-o", "--output", required=True, help="Specifies the output file.") parser.add_argument("-t", "--threads", type=int, default=None, help="Specifies number of threads. 
If not specified all available logical cpus are used.") args = parser.parse_args() #parse input peptides = list(FileReader.read_lines(args.input, in_type=Peptide)) #read in alleles alleles = generate_alleles(args.alleles) if args.cleavage_prediction.upper() not in ["PCM", "PROTEASMM_C", "PROTEASMM_S"]: print "Specified cleavage predictor is currently not supported. Please choose either PCM, PROTEASMM_C, or PROTEASMM_S" sys.exit(-1) if args.epitope_prediction.upper() not in ["SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC"]: print "Specified cleavage predictor is currently not supported. Please choose either Syfpeithi, BIMAS, SMM, SMMPMBEC" sys.exit(-1) #set-up model cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction) epi_pred = EpitopePredictorFactory(args.epitope_prediction) thr = {a.name:args.threshold for a in alleles} solver = EpitopeAssemblyWithSpacer(peptides,cl_pred,epi_pred,alleles, k=args.max_length,en=9,threshold=thr, solver="cplex", alpha=args.alpha, beta=args.beta, verbosity=0) #solve #pre-processing has to be disable otherwise many solver will destroy the symmetry of the problem #how to do this is dependent on the solver used. For CPLEX it is preprocessing_presolve=n threads = mp.cpu_count() if args.threads is None else args.threads svbws = solver.approximate(threads=threads,options={"preprocessing_presolve":"n","threads":1}) print print "Resulting String-of-Beads: ","-".join(map(str,svbws)) print with open(args.output, "w") as f: f.write("-".join(map(str,svbws)))
def test_multiple_predictors_names_different_version(self):
    """The factory must hand back the exact version requested for the same
    method name.

    IMPROVEMENT: assertEqual instead of assertTrue(a == b) — on failure it
    reports both values instead of just "False is not true".
    """
    self.assertEqual(EpitopePredictorFactory("BIMAS", version="1.0").version, "1.0")
    self.assertEqual(EpitopePredictorFactory("BIMAS", version="2.0").version, "2.0")
def test_epitope_prediction_unsupported_version(self): print EpitopePredictorFactory("BIMAS", version="4.0").predict( self.peptides_mhcI, self.mhcI)
def main():
    # Neoepitope prediction CLI: takes either a VCF (variants) or a list of
    # HGNC protein IDs, generates candidate peptides, predicts binding for the
    # requested alleles and writes a TSV report (plus an optional ETK file).
    # NOTE: Python 2 code — relies on lazy-vs-eager filter() semantics,
    # iterkeys()/iteritems().
    model = argparse.ArgumentParser(description='Neoepitope prediction for TargetInsepctor.')

    model.add_argument('-m', '--method', type=str,
                       choices=EpitopePredictorFactory.available_methods().keys(),
                       default="bimas",
                       help='The name of the prediction method')
    model.add_argument('-v', '--vcf', type=str, default=None,
                       help='Path to the vcf input file')
    model.add_argument('-t', '--type', type=str,
                       choices=["VEP", "ANNOVAR", "SNPEFF"], default="VEP",
                       help='Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)')
    model.add_argument('-p', '--proteins', type=str, default=None,
                       help='Path to the protein ID input file (in HGNC-ID)')
    model.add_argument('-l', '--length', choices=range(8, 18), type=int, default=9,
                       help='The length of peptides')
    model.add_argument('-a', '--alleles', type=str, required=True,
                       help='Path to the allele file (one per line in new nomenclature)')
    model.add_argument('-r', '--reference', type=str, default='GRCh38',
                       help='The reference genome used for varinat annotation and calling.')
    model.add_argument('-fINDEL', '--filterINDEL', action="store_true",
                       help='Filter insertions and deletions (including frameshifts)')
    model.add_argument('-fFS', '--filterFSINDEL', action="store_true",
                       help='Filter frameshift INDELs')
    model.add_argument('-fSNP', '--filterSNP', action="store_true",
                       help='Filter SNPs')
    model.add_argument('-o', '--output', type=str, required=True,
                       help='Path to the output file')
    model.add_argument('-etk', '--etk', action="store_true", help=argparse.SUPPRESS)

    args = model.parse_args()

    martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()])
    transcript_to_genes = {}

    # at least one input source is required
    if args.vcf is None and args.proteins is None:
        sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n")
        return -1

    # if vcf file is given: generate variants and filter them if HGNC IDs ar given
    if args.vcf is not None:
        protein_ids = []
        if args.proteins is not None:
            with open(args.proteins, "r") as f:
                for l in f:
                    l = l.strip()
                    if l != "":
                        protein_ids.append(l)
        if args.type == "VEP":
            variants = read_variant_effect_predictor(args.vcf, gene_filter=protein_ids)
        elif args.type == "SNPEFF":
            variants = read_vcf(args.vcf)[0]
        else:
            variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)

        # drop variants of unknown type, then apply the optional user filters
        variants = filter(lambda x: x.type != VariationType.UNKNOWN, variants)

        if args.filterSNP:
            variants = filter(lambda x: x.type != VariationType.SNP, variants)

        if args.filterINDEL:
            variants = filter(lambda x: x.type not in [VariationType.INS,
                                                       VariationType.DEL,
                                                       VariationType.FSDEL,
                                                       VariationType.FSINS], variants)

        if args.filterFSINDEL:
            variants = filter(lambda x: x.type not in [VariationType.FSDEL, VariationType.FSINS], variants)

        if not variants:
            sys.stderr.write("No variants left after filtering. Please refine your filtering criteria.\n")
            return -1

        # keep only peptides that actually carry at least one variant
        epitopes = filter(lambda x: any(x.get_variants_by_protein(tid) for tid in x.proteins.iterkeys()),
                          generate_peptides_from_variants(variants, int(args.length), martDB, EIdentifierTypes.ENSEMBL))

        # map every coding transcript to its gene ID for the report
        for v in variants:
            for trans_id, coding in v.coding.iteritems():
                if coding.geneID != None:
                    transcript_to_genes[trans_id] = coding.geneID
                else:
                    transcript_to_genes[trans_id] = 'None'

    #else: generate protein sequences from given HGNC IDs and than epitopes
    else:
        proteins = []
        with open(args.proteins, "r") as f:
            for l in f:
                # resolve HGNC ID -> Ensembl IDs -> protein sequence
                ensembl_ids = martDB.get_ensembl_ids_from_id(l.strip(), type=EIdentifierTypes.HGNC)[0]
                protein_seq = martDB.get_product_sequence(ensembl_ids[EAdapterFields.PROTID])
                if protein_seq is not None:
                    transcript_to_genes[ensembl_ids[EAdapterFields.TRANSID]] = l.strip()
                    proteins.append(Protein(protein_seq, gene_id=l.strip(),
                                            transcript_id=ensembl_ids[EAdapterFields.TRANSID]))
        epitopes = generate_peptides_from_proteins(proteins, int(args.length))

    #read in allele list
    alleles = read_lines(args.alleles, in_type=Allele)

    result = EpitopePredictorFactory(args.method).predict(epitopes, alleles=alleles)

    with open(args.output, "w") as f:
        # header: sequence, method, one column per allele, antigen ID, optional variants
        alleles = result.columns
        var_column = " Variants" if args.vcf is not None else ""
        f.write("Sequence\tMethod\t" + "\t".join(a.name for a in alleles) + "\tAntigen ID\t" + var_column + "\n")
        for index, row in result.iterrows():
            p = index[0]       # peptide
            method = index[1]  # prediction method name
            # unique set of source genes for this peptide (":FRED2" suffix stripped)
            proteins = ",".join(set([transcript_to_genes[prot.transcript_id.split(":FRED2")[0]]
                                     for prot in p.get_all_proteins()]))
            vars_str = ""
            if args.vcf is not None:
                # "<transcript>:<variant,...>" entries joined with "|"
                vars_str = "\t" + "|".join(
                    set(prot_id.split(":FRED2")[0] + ":" + ",".join(repr(v) for v in set(p.get_variants_by_protein(prot_id)))
                        for prot_id in p.proteins.iterkeys()
                        if p.get_variants_by_protein(prot_id)))
            f.write(str(p) + "\t" + method + "\t" + "\t".join("%.3f" % row[a] for a in alleles)
                    + "\t" + proteins + vars_str + "\n")

    if args.etk:
        # additional ETK-formatted output next to the main report
        with open(args.output.rsplit(".", 1)[0] + "_etk.tsv", "w") as g:
            alleles = result.columns
            g.write("Alleles:\t" + "\t".join(a.name for a in alleles) + "\n")
            for index, row in result.iterrows():
                p = index[0]
                proteins = " ".join(set([transcript_to_genes[prot.transcript_id.split(":FRED2")[0]]
                                         for prot in p.get_all_proteins()]))
                g.write(str(p) + "\t" + "\t".join("%.3f" % row[a] for a in alleles) + "\t" + proteins + "\n")
    return 0
def __main__():
    # Write "model_report.txt" stating, for the requested prediction tools,
    # which of the given alleles and peptide lengths are supported; raise when
    # nothing at all is supported.
    parser = argparse.ArgumentParser(
        "Write out information about supported models by Fred2 for installed predictor tool versions."
    )
    parser.add_argument('-p', "--peptides", help="File with one peptide per line")
    parser.add_argument('-c', "--mhcclass", default=1, help="MHC class I or II")
    parser.add_argument('-l', "--max_length", help="Maximum peptide length", type=int)
    parser.add_argument('-ml', "--min_length", help="Minimum peptide length", type=int)
    parser.add_argument('-a', "--alleles", help="<Required> MHC Alleles", required=True, type=str)
    parser.add_argument('-t', '--tools', help='Tools requested for peptide predictions', required=True, type=str)
    parser.add_argument('-v', '--versions', help='<Required> File with used software versions.', required=True)
    args = parser.parse_args()

    selected_methods = [item for item in args.tools.split(',')]

    with open(args.versions, 'r') as versions_file:
        # versions file format: "<tool> : <version>" per line
        tool_version = [(row[0].split()[0], str(row[1])) for row in csv.reader(versions_file, delimiter=":")]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))  # how to handle this?
        # get for each method the corresponding tool version
        # (matches when the tool name is a substring of the method name)
        methods = {method.strip(): version.strip() for tool, version in tool_version for method in selected_methods if tool.lower() in method.lower()}

    # get the alleles
    alleles = [Allele(a) for a in args.alleles.split(";")]

    # lengths come either from the provided peptides or from the min/max range
    peptide_lengths = []
    if (args.peptides):
        peptides = read_peptide_input(args.peptides)
        peptide_lengths = set([len(pep) for pep in peptides])
    else:
        # NOTE(review): assumes both --min_length and --max_length were given
        # when --peptides is absent; otherwise this raises TypeError.
        peptide_lengths = range(args.min_length, args.max_length + 1)

    with open("model_report.txt", 'w') as output:
        # check if requested tool versions are supported
        for method, version in methods.items():
            if version not in EpitopePredictorFactory.available_methods()[method.lower()]:
                raise ValueError("The specified version " + version + " for " + method + " is not supported by Fred2.")

        # check if requested alleles are supported
        support_all_alleles = True
        no_allele_support = True
        for a in alleles:
            supported = False
            for method, version in methods.items():
                predictor = EpitopePredictorFactory(method, version=version)
                if a not in sorted(predictor.supportedAlleles):
                    output.write("Allele " + convert_allele_back(a) + " is not supported by " + method + " " + version + ".\n")
                else:
                    supported = True
            if not supported:
                output.write("Allele " + convert_allele_back(a) + " is not supported by any of the requested tools.\n")
                logger.warning("Allele " + convert_allele_back(a) + " is not supported by any of the requested tools.")
                support_all_alleles = False
            else:
                no_allele_support = False
        if support_all_alleles:
            output.write("All selected alleles are supported by at least one of the requested tools.\n")
        if no_allele_support:
            output.write("None of the specified alleles is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.\n")
            raise ValueError("None of the specified alleles is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.")

        output.write("\n")

        # check if requested lengths are supported
        support_all_lengths = True
        no_length_support = True
        for l in peptide_lengths:
            supported = False
            for method, version in methods.items():
                predictor = EpitopePredictorFactory(method, version=version)
                if l not in sorted(predictor.supportedLength):
                    output.write("Peptide length " + str(l) + " is not supported by " + method + " " + version + ".\n")
                else:
                    supported = True
            if not supported:
                output.write("Peptide length " + str(l) + " is not supported by any of the requested tools.\n")
                logger.warning("Peptide length " + str(l) + " is not supported by any of the requested tools.")
                support_all_lengths = False
            else:
                no_length_support = False
        if support_all_lengths:
            output.write("All selected or provided peptide lengths are supported by at least one of the requested tools.\n")
        if no_length_support:
            output.write("None of the peptide lengths is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.\n")
            raise ValueError("None of the peptide lengths is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.")
def make_predictions_from_peptides(peptides, methods, alleles, protein_db, identifier, metadata):
    """Predict binding for pre-defined peptides with every requested method.

    Peptides found in *protein_db* ("self" peptides) are removed first; the
    remainder is grouped by length and each group is predicted per method.
    Returns (list of per-length DataFrames, statistics dict).
    """
    # dictionaries for syfpeithi matrices max values and allele mapping
    max_values_matrices = {}
    allele_string_map = {}

    # list to hold dataframes for all predictions
    pred_dataframes = []

    # filter out self peptides if specified
    # NOTE(review): the comment says "if specified" but the filter is applied
    # unconditionally here — confirm against the caller.
    selfies = [str(p) for p in peptides if protein_db.exists(str(p))]
    peptides_filtered = [p for p in peptides if str(p) not in selfies]

    # sort peptides by length (for predictions)
    sorted_peptides = {}
    for p in peptides_filtered:
        length = len(str(p))
        if length in sorted_peptides:
            sorted_peptides[length].append(p)
        else:
            sorted_peptides[length] = [p]

    for peplen in sorted_peptides:
        all_peptides_filtered = sorted_peptides[peplen]
        results = []
        for m in methods:
            # method strings look like "<name>-<version>" (split on first '-')
            try:
                results.extend([EpitopePredictorFactory(m.split('-')[0], version=m.split('-')[1]).predict(all_peptides_filtered, alleles=alleles)])
            # best-effort: a missing model for this length/allele is only a warning
            # (bare except also hides unrelated errors — kept as-is)
            except:
                logging.warning("Prediction for length {length} and allele {allele} not possible with {method}. No model available.".format(length=peplen, allele=','.join([str(a) for a in alleles]), method=m))

        # merge dataframes of the performed predictions
        if (len(results) == 0):
            continue
        df = results[0].merge_results(results[1:])

        df.insert(0, 'length', df.index.map(create_length_column_value))

        # record the SYFPEITHI matrix max score per (allele, length) for
        # later affinity normalization
        for a in alleles:
            conv_allele = "%s_%s%s" % (a.locus, a.supertype, a.subtype)
            allele_string_map['%s_%s' % (a, peplen)] = '%s_%i' % (conv_allele, peplen)
            max_values_matrices['%s_%i' % (conv_allele, peplen)] = get_matrix_max_score(conv_allele, peplen)

        # reset index to have index as columns
        df.reset_index(inplace=True)

        mandatory_columns = ['chr', 'pos', 'gene', 'transcripts', 'proteins',
                             'variant type', 'synonymous', 'homozygous',
                             'variant details (genomic)', 'variant details (protein)']

        # mandatory columns: take from peptide metadata when present, else NaN
        for header in mandatory_columns:
            if header not in metadata:
                df[header] = np.nan
            else:
                df[header] = df.apply(lambda row: row[0].get_metadata(header)[0], axis=1)

        # any extra metadata columns beyond the mandatory set
        for c in list(set(metadata) - set(mandatory_columns)):
            df[c] = df.apply(lambda row: row[0].get_metadata(c)[0], axis=1)

        # allele columns contain '*' (e.g. "A*02:01"): derive affinity and
        # binder columns next to each, then rename the raw column to "... score"
        for c in df.columns:
            if '*' in str(c):
                idx = df.columns.get_loc(c)
                df.insert(idx + 1, '%s affinity' % c, df.apply(lambda x: create_affinity_values(str(c), int(x['length']), float(x[c]), x['Method'], max_values_matrices, allele_string_map), axis=1))
                df.insert(idx + 2, '%s binder' % c, df.apply(lambda x: create_binder_values(float(x['%s affinity' % c]), x['Method']), axis=1))
                df = df.rename(columns={c: '%s score' % c})

        df = df.rename(columns={'Seq': 'sequence'})
        df = df.rename(columns={'Method': 'method'})
        pred_dataframes.append(df)

    # write prediction statistics
    statistics = {'prediction_methods': methods,
                  'number_of_variants': '-',
                  'number_of_peptides': len(peptides),
                  'number_of_peptides_after_filtering': len(peptides_filtered)}

    return pred_dataframes, statistics
def make_predictions_from_variants(variants_all, methods, alleles, minlength, maxlength, martsadapter, protein_db, identifier, metadata, transcriptProteinMap):
    """Generate variant-carrying peptides from *variants_all* and predict
    binding for every requested method and length.

    Returns (list of per-length DataFrames, statistics dict, filtered peptides).
    """
    # list for all peptides and filtered peptides
    all_peptides = []
    all_peptides_filtered = []

    # dictionaries for syfpeithi matrices max values and allele mapping
    max_values_matrices = {}
    allele_string_map = {}

    # list to hold dataframes for all predictions
    pred_dataframes = []

    # translate variants -> transcripts -> proteins once, up front
    prots = [p for p in generator.generate_proteins_from_transcripts(generator.generate_transcripts_from_variants(variants_all, martsadapter, ID_SYSTEM_USED))]

    # NOTE(review): range() excludes maxlength itself — confirm callers pass
    # maxlength+1 if the upper bound is meant to be inclusive.
    for peplen in range(minlength, maxlength):
        peptide_gen = generator.generate_peptides_from_proteins(prots, peplen)

        peptides_var = [x for x in peptide_gen]

        # remove peptides which are not 'variant relevant'
        peptides = [x for x in peptides_var if any(x.get_variants_by_protein(y) for y in x.proteins.keys())]

        # filter out self peptides
        selfies = [str(p) for p in peptides if protein_db.exists(str(p))]
        filtered_peptides = [p for p in peptides if str(p) not in selfies]

        all_peptides = all_peptides + peptides
        all_peptides_filtered = all_peptides_filtered + filtered_peptides

        results = []

        if len(filtered_peptides) > 0:
            for m in methods:
                # method strings look like "<name>-<version>" (split on first '-')
                try:
                    results.extend([EpitopePredictorFactory(m.split('-')[0], version=m.split('-')[1]).predict(filtered_peptides, alleles=alleles)])
                # best-effort: a missing model for this length/allele is only a
                # warning (bare except also hides unrelated errors — kept as-is)
                except:
                    logging.warning("Prediction for length {length} and allele {allele} not possible with {method}.".format(length=peplen, allele=','.join([str(a) for a in alleles]), method=m))

        if (len(results) == 0):
            continue

        df = results[0].merge_results(results[1:])

        # record the SYFPEITHI matrix max score per (allele, length) for
        # later affinity normalization
        for a in alleles:
            conv_allele = "%s_%s%s" % (a.locus, a.supertype, a.subtype)
            allele_string_map['%s_%s' % (a, peplen)] = '%s_%i' % (conv_allele, peplen)
            max_values_matrices['%s_%i' % (conv_allele, peplen)] = get_matrix_max_score(conv_allele, peplen)

        df.insert(0, 'length', df.index.map(create_length_column_value))

        # derive the per-variant annotation columns from the peptide index
        df['chr'] = df.index.map(create_variant_chr_column_value)
        df['pos'] = df.index.map(create_variant_pos_column_value)
        df['gene'] = df.index.map(create_gene_column_value)
        df['transcripts'] = df.index.map(create_transcript_column_value)
        df['proteins'] = df.index.map(create_protein_column_value)
        df['variant type'] = df.index.map(create_variant_type_column_value)
        df['synonymous'] = df.index.map(create_variant_syn_column_value)
        df['homozygous'] = df.index.map(create_variant_hom_column_value)
        df['variant details (genomic)'] = df.index.map(create_mutationsyntax_genome_column_value)
        df['variant details (protein)'] = df.index.map(create_mutationsyntax_column_value)

        # reset index to have index as columns
        df.reset_index(inplace=True)

        # allele columns contain '*' (e.g. "A*02:01"): derive affinity and
        # binder columns next to each, rename the raw column to "... score"
        # and round the score to 4 decimals
        for c in df.columns:
            if '*' in str(c):
                idx = df.columns.get_loc(c)
                df.insert(idx + 1, '%s affinity' % c, df.apply(lambda x: create_affinity_values(str(c), int(x['length']), float(x[c]), x['Method'], max_values_matrices, allele_string_map), axis=1))
                df.insert(idx + 2, '%s binder' % c, df.apply(lambda x: create_binder_values(float(x['%s affinity' % c]), x['Method']), axis=1))
                df = df.rename(columns={c: '%s score' % c})
                df['%s score' % c] = df['%s score' % c].map(lambda x: round(x, 4))

        for c in metadata:
            df[c] = df.apply(lambda row: create_metadata_column_value(row, c), axis=1)

        df = df.rename(columns={'Seq': 'sequence'})
        df = df.rename(columns={'Method': 'method'})
        pred_dataframes.append(df)

    statistics = {'prediction_methods': methods,
                  'number_of_variants': len(variants_all),
                  'number_of_peptides': len(all_peptides),
                  'number_of_peptides_after_filtering': len(all_peptides_filtered)}

    return pred_dataframes, statistics, all_peptides_filtered
def main():
    """CLI entry point: assemble an epitope-based string-of-beads vaccine
    with sequence-optimized spacers and write it as FASTA to the output file.

    Reads peptides and HLA alleles from the given files, validates the chosen
    cleavage/epitope predictors, builds an EpitopeAssemblyWithSpacer model
    (CBC solver) and writes the assembled design.

    Returns 0 on success; exits with -1 on an unsupported predictor choice.
    """
    parser = argparse.ArgumentParser(
        description='The software is a novel approach to construct epitope-based string-of-beads \
vaccines in optimal order and with sequence-optimized spacers of flexible length \
such that the recovery of contained epitopes is maximized and immunogenicity of \
arising neo-epitopes is reduced.',
    )
    parser.add_argument('-i', "--input",
                        required=True,
                        help="File containing epitopes (one peptide per line)",
                        type=str)
    parser.add_argument('-a', "--alleles",
                        required=True,
                        help="Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)",
                        type=str)
    # parameters of the model
    parser.add_argument('-l', "--max_length",
                        default=6,
                        type=int,
                        help="Specifies the max. length of the spacers (default 6)")
    parser.add_argument('-al', "--alpha",
                        default=0.99,
                        type=float,
                        help="Specifies the first-order preference of the user in the model [0,1] (default 0.99)")
    parser.add_argument('-be', "--beta",
                        default=0.0,
                        type=float,
                        help="Specifies the second-order preference of the user in the model [0,1] (default 0).")
    parser.add_argument('-cp', "--cleavage_prediction",
                        default="pcm",
                        choices=["pcm", "proteasmm_c", "proteasmm_i"],
                        help="Specifies the used cleavage prediction method (default PCM) [available: PCM, PROTEASMM_C, PROTEASMM_I]",
                        type=str)
    parser.add_argument('-ep', "--epitope_prediction",
                        default="syfpeithi",
                        choices=["syfpeithi", "smm", "smmpmbec", "bimas"],
                        help="Specifies the used epitope prediction method (default Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]",
                        type=str)
    parser.add_argument('-t', "--threshold",
                        default=20,
                        type=float,
                        help="Specifies epitope prediction threshold for SYFPEITHI (default 20).")
    parser.add_argument('-o', "--output",
                        required=True,
                        type=str,
                        help="Specifies the output file.")
    # BUGFIX: default was 1, which made the documented "use all logical cpus
    # when unspecified" branch below (args.threads is None) unreachable; None
    # matches the help text and the sibling implementation of this tool.
    parser.add_argument('-p', "--threads",
                        type=int,
                        default=None,
                        help="Specifies number of threads. If not specified all available logical cpus are used.")
    # BUGFIX: help text was copy-pasted from --threads.
    parser.add_argument('-apx', "--approximate",
                        action="store_true",
                        help="If set, an approximate solution is computed instead of solving to optimality.")
    args = parser.parse_args()

    # parse input peptides
    peptides = read_lines(args.input)
    # read in alleles
    alleles = generate_alleles(args.alleles)

    if args.cleavage_prediction.upper() not in ["PCM", "PROTEASMM_C", "PROTEASMM_I"]:
        sys.stderr.write("Specified cleavage predictor is currently not supported. \
Please choose either PCM, PROTEASMM_C, or PROTEASMM_I")
        sys.exit(-1)

    if args.epitope_prediction.upper() not in ["SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC"]:
        # BUGFIX: message previously said "cleavage predictor".
        sys.stderr.write("Specified epitope predictor is currently not supported. \
Please choose either Syfpeithi, BIMAS, SMM, SMMPMBEC")
        sys.exit(-1)

    # set-up model
    cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction)
    epi_pred = EpitopePredictorFactory(args.epitope_prediction)
    thr = {a.name: args.threshold for a in alleles}
    solver = EpitopeAssemblyWithSpacer(peptides,
                                       cl_pred,
                                       epi_pred,
                                       alleles,
                                       k=args.max_length,
                                       en=9,
                                       threshold=thr,
                                       solver="cbc",
                                       alpha=args.alpha,
                                       beta=args.beta,
                                       verbosity=1)

    # solve
    # pre-processing has to be disabled otherwise many solvers will destroy the
    # symmetry of the problem; how to do this is dependent on the solver used.
    # For CPLEX it is preprocessing_presolve=n
    # TODO: CBC should be shipped with the node
    # TODO: has to be tested with CBC
    # TODO: LKH has to be shipped as well -> only academic license!
    threads = mp.cpu_count() if args.threads is None else args.threads
    if args.approximate:
        svbws = solver.approximate(threads=threads,
                                   options={"preprocess": "off", "threads": 1})
        # fall back to the exact solver if no approximate solution was found
        if not svbws:
            svbws = solver.solve(threads=threads,
                                 options={"preprocess": "off", "threads": 1})
    else:
        svbws = solver.solve(threads=threads,
                             options={"preprocess": "off", "threads": 1})

    with open(args.output, "w") as f:
        f.write(">assembled_spacer_design\n")
        f.write("".join(map(str, svbws)))
    return 0
def __main__(): parser = argparse.ArgumentParser(version=VERSION) parser.add_argument('-V', '--variations', dest="var_file", help='<Required> full path to the input variations', required=True) parser.add_argument('-o', "--outfile", dest="outfile_path", help="Created fasta file", required=True) parser.add_argument( '-d', "--digest", dest="digest", type=int, help="Length of peptides for predigestion and prediction, default 9.") parser.add_argument('-a', "--alleles", dest="alleles", help="Input alleles for prediction") parser.add_argument( '-p', "--predict", dest="predict_with", help="Method of prediction, needs alleles & length, allowed:[{m}]". format(m=PRED_METH)) parser.add_argument( '-f', "--filter", dest="filter", type=float, help= "Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict" ) parser.add_argument('-P', "--Proteins", dest="only_proteins", action='store_true', help="Will write only proteins.") parser.add_argument( '-b', "--base", dest="basefasta_path", help="If given, entries are replaced by the variation.") options = parser.parse_args() if len(sys.argv) <= 1: parser.print_help() sys.exit(1) if options.filter and not options.predict_with: parser.print_help() print "Need alleles with predict option, aborting!" sys.exit(1) if options.predict_with and not options.alleles: parser.print_help() print "Need alleles with predict option, aborting!" sys.exit(1) temp_dir = "/tmp/" logging.basicConfig( filename=os.path.splitext(options.outfile_path)[0] + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log', filemode='w+', level=logging.DEBUG) #, format='%(levelname)s:%(message)s' logging.info("Starting variant fasta creation " + options.outfile_path + " at " + str(datetime.datetime.now())) logging.warning("verbosity turned on") #... look at theos filter, ligandoqc, fasta-distributions, lica and the morgenstellen server conten scripts # complete proteins? # only containing binders? 
# k-mers? # binders only? # FastaSlicer.py? # remove original if homozygous (needs fasta input)? # add germline variant option? or expect all to be in one vcf? # MyObject = type('MyObject', (object,), {}) # options = MyObject() # setattr(options,"var_file","/home/walzer/immuno-tools/Fred2/Fred2/Data/examples/vcftestfile3.vcf") # # vt = os.path.splitext(options.var_file)[-1] # if ".vcf" == vt: # vcfvars, accessions = FileReader.read_vcf(options.var_file) # # mart_db = MartsAdapter(biomart="http://grch37.ensembl.org") # # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ) # transcripts = [x for x in transcript_gen if x.vars] # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ) # protein_gen = g.generate_proteins_from_transcripts(transcript_gen) # proteins = [x for x in protein_gen if x.vars] # for p in proteins: # p.gene_id = p.vars.values()[0][0].gene # # # for t in transcripts: # t.gene_id = t.vars.values()[0].gene # vt = os.path.splitext(options.var_file)[-1] if ".vcf" == vt: vcfvars, accessions = FileReader.read_vcf(options.var_file) elif ".GSvar" == vt: pass # vcfvars = FileReader.read_GSvar(options.var_file) else: m = "Could not read variants {f}, aborting.".format(f=options.var_file) logging.error(m) print m sys.exit(1) mart_db = MartsAdapter(biomart="http://grch37.ensembl.org" ) # TODO guess id_type for mart_db from accessions transcript_gen = g.generate_transcripts_from_variants( vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ) protein_gen = g.generate_proteins_from_transcripts(transcript_gen) proteins = [x for x in protein_gen if x.vars] # removing unvaried for p in proteins: p.gene_id = p.vars.values( )[0][0].gene # assume gene name from first variant proteins = [p for p in proteins if not is_stop_gain(p)] # kick out stop gains # First exit option if not (options.predict_with or options.filter) and options.only_proteins: if options.basefasta_path: 
# TODO - replace from base fasta print "N/A" sys.exit(0) else: e = proteins_to_fasta(proteins) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0) # From now on, digestion must be set somehow if not options.digest: digest = 9 else: digest = options.digest peptide_gen = g.generate_peptides_from_proteins(proteins, digest) peptides = [x for x in peptide_gen] peptides_var = [ x for x in peptides if any( x.get_variants_by_protein(y) for y in x.proteins.keys()) ] # removing unvaried # Second exit option if not (options.predict_with or options.filter): e = peptides_to_fasta(peptides_var) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0) # From now on, predictions are needed try: target_alleles_set = set( FileReader.read_lines(options.alleles, in_type=Allele)) except Exception as e: m = "Could not read alleles file {f}, aborting.".format( f=options.alleles) logging.error(m) print m, "what:", str(e) sys.exit(1) try: ttn = EpitopePredictorFactory(options.predict_with) except Exception as e: m = "Could not initialize prediction method {f}, aborting.".format( f=options.predict_with) logging.error(m) print m sys.exit(1) try: preds = ttn.predict(peptides_var, alleles=target_alleles_set) except Exception as e: print "something went wrong with the prediction", options.inf, options.predict_with, "what:", str( e) sys.exit(1) # punch prediction results in peptide metadata (inside pandas dataframe) #PRED_METH = set() for i, row in preds.iterrows(): for j in i[1:]: i[0].log_metadata(j, dict(zip(row.index, row.values))) #PRED_METH.add(j) # need that later # Third exit option if not options.filter: if options.only_proteins: if options.basefasta_path: # TODO - replace from base fasta plus prediction annotation print "N/A" sys.exit(0) else: prs = annotate_protein_from_peptides(preds) e = proteins_to_fasta(prs) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0) else: e = peptides_to_fasta(preds) with open(options.outfile_path, 'w') as f: 
f.write(e) sys.exit(0) # kick out nonbinder preds_f = preds[(preds > options.filter).any(axis=1)] # Fourth exit option if options.only_proteins: if options.basefasta_path: # TODO - replace from base fasta binders only plus prediction annotation print "N/A" sys.exit(0) else: prs = annotate_protein_from_peptides(preds_f) e = proteins_to_fasta(prs) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0) else: e = peptides_to_fasta(preds_f) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0)
def main():
    """CLI entry point for running an epitope prediction method over peptides.

    Input is either a protein FASTA (digested into peptides of the requested
    length) or a plain peptide list; predictions for the given alleles are
    written as a TSV with one column per allele (plus antigen IDs for FASTA
    input). Returns 0 on success, -1 on an unknown input type.
    """
    # Specify CTD interface: every CTD model has to have at least a name and a
    # version, plus any of the optional attributes below them.
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument('-m', '--method', type=str,
                        choices=EpitopePredictorFactory.available_methods().keys(),
                        default="bimas",
                        help='The name of the prediction method')
    parser.add_argument('-v', '--version', type=str, default="",
                        help='The version of the prediction method')
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to the input file')
    parser.add_argument('-t', '--type', choices=["fasta", "peptide"],
                        type=str, default="fasta",
                        help='The data type of the input (fasta, peptide list)')
    parser.add_argument('-l', '--length', choices=range(8, 18),
                        type=int, default=9,
                        help='The length of peptides')
    parser.add_argument('-a', '--alleles', type=str, required=True,
                        help='Path to the allele file (one per line in new nomenclature)')
    parser.add_argument('-op', '--options', type=str, default="",
                        help="Additional options that get directly past to the tool")
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='Path to the output file')
    args = parser.parse_args()

    # build the peptide list from either a protein FASTA or a peptide file
    if args.type == "fasta":
        # peek at the header to decide where the accession sits
        with open(args.input, 'r') as handle:
            header = handle.readline()
        id_pos = 1 if header.count("|") else 0
        antigens = read_fasta(args.input, in_type=Protein, id_position=id_pos)
        peptides = generate_peptides_from_proteins(antigens, args.length)
    elif args.type == "peptide":
        peptides = read_lines(args.input, in_type=Peptide)
    else:
        sys.stderr.write('Input type not known\n')
        return -1

    # read in alleles
    hla = read_lines(args.alleles, in_type=Allele)

    # instantiate the predictor (pinned version only when one was given)
    if args.version == "":
        predictor = EpitopePredictorFactory(args.method)
    else:
        predictor = EpitopePredictorFactory(args.method, version=args.version)
    result = predictor.predict(peptides, hla, options=args.options)

    # write TSV: sequence, method, one score column per allele, and for FASTA
    # input a trailing column with the protein/transcript IDs
    with open(args.output, "w") as out:
        antigen_header = "\tAntigen ID" if args.type == "fasta" else ""
        allele_cols = result.columns
        out.write("Sequence\tMethod\t" +
                  "\t".join(a.name for a in allele_cols) +
                  antigen_header + "\n")
        for index, row in result.iterrows():
            pep, method = index[0], index[1]
            if args.type == "fasta":
                antigen_ids = "\t" + ",".join(
                    prot.transcript_id for prot in pep.get_all_proteins())
            else:
                antigen_ids = ""
            scores = "\t".join("%.3f" % row[a] for a in allele_cols)
            out.write(str(pep) + "\t" + method + "\t" + scores +
                      antigen_ids + "\n")
    return 0
def main(): parser = argparse.ArgumentParser( description= """The software is a novel approach to construct epitope-based string-of-beads vaccines in optimal order and with sequence-optimized spacers of flexible length such that the recovery of contained epitopes is maximized and immunogenicity of arising neo-epitopes is reduced. """) parser.add_argument("-i", "--input", required=True, help="File containing epitopes (one peptide per line)") parser.add_argument( "-a", "--alleles", required=True, help= "Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)" ) #parameters of the model parser.add_argument( "-k", "--max_length", default=6, type=int, help="Specifies the max. length of the spacers (default 6)") parser.add_argument( "-al", "--alpha", default=0.99, type=float, help= "Specifies the first-order preference of the user in the model [0,1] (default 0.99)" ) parser.add_argument( "-be", "--beta", default=0.0, type=float, help= "Specifies the second-order preference of the user in the model [0,1] (default 0)." ) parser.add_argument( "-cp", "--cleavage_prediction", default="PCM", help= "Specifies the used cleavage prediction method (default PCM) [available: PCM, PROTEASMM_C, PROTEASMM_S]" ) parser.add_argument( "-ep", "--epitope_prediction", default="Syfpeithi", help= "Specifies the used epitope prediction method (default Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]" ) parser.add_argument( "-thr", "--threshold", default=20, type=float, help= "Specifies epitope prediction threshold for SYFPEITHI (default 20).") parser.add_argument("-o", "--output", required=True, help="Specifies the output file.") parser.add_argument( "-t", "--threads", type=int, default=None, help= "Specifies number of threads. If not specified all available logical cpus are used." ) parser.add_argument( "--ips-solver", default="cplex", choices=["cplex", "cbc"], help= "Executable name of the IPS solver. Executable needs to be available in PATH." 
) parser.add_argument("--tsp-solution", default="approximate", choices=["approximate", "optimal"], help="Type of solution of the TSP") parser.add_argument( "--random-order", action="store_true", help= "Indicate whether to generate a random ordered string-of-beads polypeptide" ) parser.add_argument( "--seed", type=int, default=1, help="Seed for random ordering of string-of-beads polypeptide") args = parser.parse_args() #parse input peptides = list(FileReader.read_lines(args.input, in_type=Peptide)) #read in alleles alleles = generate_alleles(args.alleles) if args.cleavage_prediction.upper() not in [ "PCM", "PROTEASMM_C", "PROTEASMM_S" ]: print "Specified cleavage predictor is currently not supported. Please choose either PCM, PROTEASMM_C, or PROTEASMM_S" sys.exit(-1) if args.epitope_prediction.upper() not in [ "SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC" ]: print "Specified cleavage predictor is currently not supported. Please choose either Syfpeithi, BIMAS, SMM, SMMPMBEC" sys.exit(-1) #set-up model cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction) epi_pred = EpitopePredictorFactory(args.epitope_prediction) thr = {a.name: args.threshold for a in alleles} solver = EpitopeAssemblyWithSpacer(peptides, cl_pred, epi_pred, alleles, k=args.max_length, en=9, threshold=thr, solver=args.ips_solver, alpha=args.alpha, beta=args.beta, verbosity=0) #solve #pre-processing has to be disable otherwise many solver will destroy the symmetry of the problem #how to do this is dependent on the solver used. For CPLEX it is preprocessing_presolve=n threads = mp.cpu_count() if args.threads is None else args.threads if args.tsp_solution == "approximate": svbws = solver.approximate(threads=threads, options={ "preprocessing_presolve": "n", "threads": 1 }) else: svbws = solver.solve(threads=threads, options={ "preprocessing_presolve": "n", "threads": 1 }) # Generate random ordered string-of-breads, but still uses optimal spacers # determined from the above solve function. 
if args.random_order: print "Generating a randomly ordered polypeptide" random.seed(args.seed) random_order_sob = [] random.shuffle(peptides) for i in range(len(peptides)): # Break from loop once we hit the last peptide if i == len(peptides) - 1: random_order_sob.extend([Peptide(str(peptides[i]))]) break left_peptide = str(peptides[i]) right_peptide = str(peptides[i + 1]) opt_spacer = solver.spacer[(left_peptide, right_peptide)] # Right peptide gets added in the next iteration random_order_sob.extend( [Peptide(left_peptide), Peptide(opt_spacer)]) svbws = random_order_sob print print "Resulting String-of-Beads: ", "-".join(map(str, svbws)) print with open(args.output, "w") as f: f.write("-".join(map(str, svbws)))
def test_wrong_allele_input(self):
    """Predicting with a non-Allele object as allele must raise ValueError."""
    predictor = EpitopePredictorFactory("NetMHC")
    with self.assertRaises(ValueError):
        predictor.predict(self.mhcI, alleles=self.transcript)