def _read_nonempty_lines(path):
    """Return the stripped, non-blank lines of the text file at ``path``."""
    with open(path, "r") as handle:
        return [entry for entry in (raw.strip() for raw in handle) if entry]


def _gene_ids_of(peptide, transcript_to_genes):
    """Return the set of gene IDs recorded for the proteins of ``peptide``.

    Each protein's transcript ID is cut at the first ":FRED2" marker before
    it is looked up in ``transcript_to_genes``.
    """
    return set(
        transcript_to_genes[prot.transcript_id.split(":FRED2")[0]]
        for prot in peptide.get_all_proteins())


def _variant_annotation(peptide):
    """Return the "|"-joined ``<transcript>:<variants>`` entries of ``peptide``.

    Proteins without any variant are left out; the variants of one protein
    are de-duplicated and joined by ",".
    """
    entries = set()
    for prot_id in peptide.proteins.keys():
        prot_variants = peptide.get_variants_by_protein(prot_id)
        if prot_variants:
            entries.add(
                prot_id.split(":FRED2")[0] + ":" +
                ",".join(repr(v) for v in set(prot_variants)))
    return "|".join(entries)


def main():
    """Command-line entry point for neoepitope prediction.

    Peptides are generated either from the variants of an annotated VCF file
    (optionally restricted to the genes listed in the protein ID file) or,
    when no VCF is given, from the protein sequences of the listed HGNC IDs.
    Binding predictions for all alleles of the allele file are written to
    the output file as a tab-separated table; with the hidden ``--etk`` flag
    a second table ``<output stem>_etk.tsv`` is written as well.

    Returns:
        0 on success, -1 if the input is unusable (no input file given,
        unknown reference genome, or no variants left after filtering).
    """
    model = argparse.ArgumentParser(
        description='Neoepitope prediction for TargetInsepctor.')
    model.add_argument(
        '-m', '--method', type=str,
        choices=EpitopePredictorFactory.available_methods().keys(),
        default="bimas", help='The name of the prediction method')
    model.add_argument('-v', '--vcf', type=str, default=None,
                       help='Path to the vcf input file')
    model.add_argument(
        '-t', '--type', type=str, choices=["VEP", "ANNOVAR", "SNPEFF"],
        default="VEP",
        help='Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)')
    model.add_argument('-p', '--proteins', type=str, default=None,
                       help='Path to the protein ID input file (in HGNC-ID)')
    model.add_argument('-l', '--length', choices=range(8, 18), type=int,
                       default=9, help='The length of peptides')
    model.add_argument(
        '-a', '--alleles', type=str, required=True,
        help='Path to the allele file (one per line in new nomenclature)')
    model.add_argument(
        '-r', '--reference', type=str, default='GRCh38',
        help='The reference genome used for varinat annotation and calling.')
    model.add_argument(
        '-fINDEL', '--filterINDEL', action="store_true",
        help='Filter insertions and deletions (including frameshifts)')
    model.add_argument('-fFS', '--filterFSINDEL', action="store_true",
                       help='Filter frameshift INDELs')
    model.add_argument('-fSNP', '--filterSNP', action="store_true",
                       help='Filter SNPs')
    model.add_argument('-o', '--output', type=str, required=True,
                       help='Path to the output file')
    model.add_argument('-etk', '--etk', action="store_true",
                       help=argparse.SUPPRESS)
    args = model.parse_args()

    # Validate the cheap preconditions before the BioMart adapter is built.
    if args.vcf is None and args.proteins is None:
        sys.stderr.write(
            "At least a vcf file or a protein id file has to be provided.\n")
        return -1

    try:
        mart_url = MARTDBURL[args.reference.upper()]
    except KeyError:
        sys.stderr.write(
            "Unknown reference genome '%s'.\n" % args.reference)
        return -1
    martDB = MartsAdapter(biomart=mart_url)

    transcript_to_genes = {}

    # if vcf file is given: generate variants and filter them if HGNC IDs are given
    if args.vcf is not None:
        protein_ids = []
        if args.proteins is not None:
            protein_ids = _read_nonempty_lines(args.proteins)

        if args.type == "VEP":
            variants = read_variant_effect_predictor(args.vcf,
                                                     gene_filter=protein_ids)
        elif args.type == "SNPEFF":
            # NOTE(review): the gene filter is not applied in this branch.
            variants = read_vcf(args.vcf)[0]
        else:
            variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)

        # Collect every variation type the user asked to drop, then filter
        # once into a real list: the result is tested for emptiness and
        # iterated more than once below, which a lazy filter() object on
        # Python 3 would not support.
        excluded = [VariationType.UNKNOWN]
        if args.filterSNP:
            excluded.append(VariationType.SNP)
        if args.filterINDEL:
            excluded.extend([VariationType.INS, VariationType.DEL,
                             VariationType.FSDEL, VariationType.FSINS])
        if args.filterFSINDEL:
            excluded.extend([VariationType.FSDEL, VariationType.FSINS])
        variants = [v for v in variants if v.type not in excluded]

        if not variants:
            sys.stderr.write(
                "No variants left after filtering. Please refine your filtering criteria.\n"
            )
            return -1

        # Keep only peptides that carry at least one variant.
        epitopes = [
            pep for pep in generate_peptides_from_variants(
                variants, int(args.length), martDB, EIdentifierTypes.ENSEMBL)
            if any(pep.get_variants_by_protein(tid)
                   for tid in pep.proteins.keys())
        ]

        for v in variants:
            for trans_id, coding in v.coding.items():
                if coding.geneID is not None:
                    transcript_to_genes[trans_id] = coding.geneID
                else:
                    transcript_to_genes[trans_id] = 'None'

    # else: generate protein sequences from given HGNC IDs and then epitopes
    else:
        proteins = []
        # Blank lines are skipped, matching the handling in the VCF branch.
        for gene_id in _read_nonempty_lines(args.proteins):
            try:
                ensembl_ids = martDB.get_ensembl_ids_from_id(
                    gene_id, type=EIdentifierTypes.HGNC)[0]
            except IndexError:
                # An ID without any mapping must not abort the whole run.
                sys.stderr.write(
                    "No Ensembl IDs found for '%s'; skipping.\n" % gene_id)
                continue
            protein_seq = martDB.get_product_sequence(
                ensembl_ids[EAdapterFields.PROTID])
            if protein_seq is not None:
                transcript_id = ensembl_ids[EAdapterFields.TRANSID]
                transcript_to_genes[transcript_id] = gene_id
                proteins.append(
                    Protein(protein_seq, gene_id=gene_id,
                            transcript_id=transcript_id))
        epitopes = generate_peptides_from_proteins(proteins, int(args.length))

    # read in allele list
    alleles = read_lines(args.alleles, in_type=Allele)

    result = EpitopePredictorFactory(args.method).predict(epitopes,
                                                          alleles=alleles)

    # From here on the alleles are the columns of the prediction result.
    alleles = result.columns
    allele_header = "\t".join(a.name for a in alleles)

    with open(args.output, "w") as f:
        var_column = " Variants" if args.vcf is not None else ""
        f.write("Sequence\tMethod\t" + allele_header + "\tAntigen ID\t" +
                var_column + "\n")
        # Each row index is read as (peptide, method).
        for index, row in result.iterrows():
            p = index[0]
            method = index[1]
            proteins = ",".join(_gene_ids_of(p, transcript_to_genes))
            vars_str = ""
            if args.vcf is not None:
                vars_str = "\t" + _variant_annotation(p)
            f.write(str(p) + "\t" + method + "\t" +
                    "\t".join("%.3f" % row[a] for a in alleles) + "\t" +
                    proteins + vars_str + "\n")

    if args.etk:
        with open(args.output.rsplit(".", 1)[0] + "_etk.tsv", "w") as g:
            g.write("Alleles:\t" + allele_header + "\n")
            for index, row in result.iterrows():
                p = index[0]
                proteins = " ".join(_gene_ids_of(p, transcript_to_genes))
                g.write(str(p) + "\t" +
                        "\t".join("%.3f" % row[a] for a in alleles) + "\t" +
                        proteins + "\n")
    return 0
for _id, seq in SimpleFastaParser(file): # generate element: _id = _id.split(" ")[0] try: collect.add(Protein(seq.strip().upper(), transcript_id=_id)) except TypeError: collect.add(Protein(seq.strip().upper())) return list(collect) proteins = read_protein_fasta(args.input) c = 0 for k in range(args.min_length, args.max_length + 1): peptides = generate_peptides_from_proteins(proteins, k) # get proteins and corresponding counts pd_peptides = pd.DataFrame([(str(pep), ','.join( [prot.transcript_id.split(' ')[0] for prot in pep.get_all_proteins()]), ','.join([ str(len(pep.proteinPos[prot.transcript_id])) for prot in pep.get_all_proteins() ])) for pep in peptides], columns=['sequence', 'protein_ids', 'counts']) # assign id pd_peptides = pd_peptides.assign( id=[str(c + id) for id in pd_peptides.index]) c += len(pd_peptides['sequence']) if k == args.min_length: pd_peptides[['sequence', 'id', 'protein_ids',
def main():
    """Command-line entry point for TAP prediction.

    Reads either a FASTA file (peptides of the requested length are generated
    from its proteins) or a plain peptide list, runs the selected TAP
    predictor and writes one tab-separated line per peptide to the output
    file.

    Returns:
        0 on success, -1 if the input type is not recognised.
    """
    model = argparse.ArgumentParser(
        description='Commandline tool for TAP prediction',
    )
    model.add_argument('-m', '--method', type=str,
                       choices=TAPPredictorFactory.available_methods().keys(),
                       default="svmtap",
                       help='The name of the prediction method')
    model.add_argument('-v', '--version', type=str, default="",
                       help='The version of the prediction method')
    model.add_argument('-i', '--input', type=str, required=True,
                       help='Path to the input file')
    model.add_argument('-t', '--type', choices=["fasta", "peptide"], type=str,
                       default="fasta",
                       help='The data type of the input (fasta, peptide list)')
    model.add_argument('-l', '--length', type=int, default=9,
                       help='The length of peptides')
    model.add_argument('-op', '--options', type=str, default="",
                       help="Additional options that get directly past to the tool")
    model.add_argument('-o', '--output', type=str, required=True,
                       help='Path to the output file')
    args = model.parse_args()

    # fasta protein
    if args.type == "fasta":
        # Peek at the first line: if it contains "|", the identifier is taken
        # from position 1 of the header, otherwise from position 0.
        with open(args.input, 'r') as f:
            first_line = f.readline()
        sep_pos = 1 if "|" in first_line else 0
        proteins = read_fasta(args.input, in_type=Protein, id_position=sep_pos)
        peptides = generate_peptides_from_proteins(proteins, int(args.length))
    elif args.type == "peptide":
        peptides = read_lines(args.input, in_type=Peptide)
    else:
        sys.stderr.write('Input type not known\n')
        return -1

    # Only forward a version when the user asked for one, so the factory
    # picks its own default otherwise.
    factory_kwargs = {} if args.version == "" else {"version": args.version}
    predictor = TAPPredictorFactory(args.method, **factory_kwargs)
    result = predictor.predict(peptides, options=args.options)

    # write to TSV columns sequence method score...,protein-id/transcript-id
    with_protein_ids = args.type == "fasta"
    with open(args.output, "w") as f:
        header_suffix = "\tProtein ID" if with_protein_ids else ""
        f.write("Sequence\tMethod\t" + "Score" + header_suffix + "\n")
        for p, row in result.iterrows():
            if with_protein_ids:
                proteins = ",".join(prot.transcript_id
                                    for prot in p.get_all_proteins())
            else:
                proteins = ""
            # items() instead of iteritems(): the latter was removed from
            # pandas in 2.0, while items() exists in old and new releases.
            scores = "\t".join("%s\t%.3f" % (method, score)
                               for method, score in row.items())
            f.write(str(p) + "\t" + scores + "\t" + proteins + "\n")
    return 0
def main():
    """Run the neoepitope prediction command-line tool.

    Input is either an annotated VCF file (``--vcf``; variants can be limited
    to the genes of ``--proteins`` and filtered by variation type) or, without
    a VCF, a list of HGNC IDs whose protein sequences are fetched through the
    BioMart adapter. Peptides of ``--length`` are generated, scored for every
    allele of ``--alleles`` with the chosen method, and written as a
    tab-separated table to ``--output``. The hidden ``--etk`` switch writes an
    additional ``<output stem>_etk.tsv`` table.

    Returns:
        0 on success; -1 when neither input file is given, the reference
        genome is unknown, or filtering leaves no variants.
    """
    model = argparse.ArgumentParser(description='Neoepitope prediction for TargetInsepctor.')

    model.add_argument(
        '-m', '--method',
        type=str,
        choices=EpitopePredictorFactory.available_methods().keys(),
        default="bimas",
        help='The name of the prediction method'
    )
    model.add_argument(
        '-v', '--vcf',
        type=str,
        default=None,
        help='Path to the vcf input file'
    )
    model.add_argument(
        '-t', '--type',
        type=str,
        choices=["VEP", "ANNOVAR", "SNPEFF"],
        default="VEP",
        help='Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)'
    )
    model.add_argument(
        '-p', '--proteins',
        type=str,
        default=None,
        help='Path to the protein ID input file (in HGNC-ID)'
    )
    model.add_argument(
        '-l', '--length',
        choices=range(8, 18),
        type=int,
        default=9,
        help='The length of peptides'
    )
    model.add_argument(
        '-a', '--alleles',
        type=str,
        required=True,
        help='Path to the allele file (one per line in new nomenclature)'
    )
    model.add_argument(
        '-r', '--reference',
        type=str,
        default='GRCh38',
        help='The reference genome used for varinat annotation and calling.'
    )
    model.add_argument(
        '-fINDEL', '--filterINDEL',
        action="store_true",
        help='Filter insertions and deletions (including frameshifts)'
    )
    model.add_argument(
        '-fFS', '--filterFSINDEL',
        action="store_true",
        help='Filter frameshift INDELs'
    )
    model.add_argument(
        '-fSNP', '--filterSNP',
        action="store_true",
        help='Filter SNPs'
    )
    model.add_argument(
        '-o', '--output',
        type=str,
        required=True,
        help='Path to the output file'
    )
    model.add_argument(
        '-etk', '--etk',
        action="store_true",
        help=argparse.SUPPRESS
    )

    args = model.parse_args()

    if args.vcf is None and args.proteins is None:
        sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n")
        return -1

    # Report an unsupported reference as a usage error, not a traceback.
    reference_key = args.reference.upper()
    if reference_key not in MARTDBURL:
        sys.stderr.write("Unknown reference genome '%s'.\n" % args.reference)
        return -1
    martDB = MartsAdapter(biomart=MARTDBURL[reference_key])

    transcript_to_genes = {}

    # if vcf file is given: generate variants and filter them if HGNC IDs are given
    if args.vcf is not None:
        protein_ids = []
        if args.proteins is not None:
            with open(args.proteins, "r") as f:
                protein_ids = [l.strip() for l in f if l.strip() != ""]

        if args.type == "VEP":
            variants = read_variant_effect_predictor(args.vcf, gene_filter=protein_ids)
        elif args.type == "SNPEFF":
            # NOTE(review): protein_ids is not used as a gene filter here.
            variants = read_vcf(args.vcf)[0]
        else:
            variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)

        # Each filter step builds a list (not a lazy filter object) so that
        # the emptiness check and the repeated iteration below also work on
        # Python 3.
        variants = [x for x in variants if x.type != VariationType.UNKNOWN]

        if args.filterSNP:
            variants = [x for x in variants if x.type != VariationType.SNP]

        if args.filterINDEL:
            indel_types = [VariationType.INS, VariationType.DEL,
                           VariationType.FSDEL, VariationType.FSINS]
            variants = [x for x in variants if x.type not in indel_types]

        if args.filterFSINDEL:
            frameshift_types = [VariationType.FSDEL, VariationType.FSINS]
            variants = [x for x in variants if x.type not in frameshift_types]

        if not variants:
            sys.stderr.write("No variants left after filtering. Please refine your filtering criteria.\n")
            return -1

        # Discard peptides that do not overlap any variant.
        candidates = generate_peptides_from_variants(variants, int(args.length),
                                                     martDB, EIdentifierTypes.ENSEMBL)
        epitopes = [x for x in candidates
                    if any(x.get_variants_by_protein(tid) for tid in x.proteins.keys())]

        for v in variants:
            for trans_id, coding in v.coding.items():
                gene = coding.geneID
                transcript_to_genes[trans_id] = gene if gene is not None else 'None'

    # else: generate protein sequences from given HGNC IDs and then epitopes
    else:
        proteins = []
        with open(args.proteins, "r") as f:
            for l in f:
                hgnc_id = l.strip()
                if hgnc_id == "":
                    # Blank lines are ignored, as in the VCF branch above.
                    continue
                try:
                    ensembl_ids = martDB.get_ensembl_ids_from_id(
                        hgnc_id, type=EIdentifierTypes.HGNC)[0]
                except IndexError:
                    # No mapping for this ID: warn and carry on with the rest.
                    sys.stderr.write("No Ensembl IDs found for '%s'; skipping.\n" % hgnc_id)
                    continue
                protein_seq = martDB.get_product_sequence(ensembl_ids[EAdapterFields.PROTID])
                if protein_seq is not None:
                    transcript_to_genes[ensembl_ids[EAdapterFields.TRANSID]] = hgnc_id
                    proteins.append(Protein(protein_seq, gene_id=hgnc_id,
                                            transcript_id=ensembl_ids[EAdapterFields.TRANSID]))
        epitopes = generate_peptides_from_proteins(proteins, int(args.length))

    # read in allele list
    alleles = read_lines(args.alleles, in_type=Allele)

    result = EpitopePredictorFactory(args.method).predict(epitopes, alleles=alleles)

    with open(args.output, "w") as f:
        # The result columns replace the allele list read from file.
        alleles = result.columns
        var_column = " Variants" if args.vcf is not None else ""
        f.write("Sequence\tMethod\t"+"\t".join(a.name for a in alleles)+"\tAntigen ID\t"+var_column+"\n")
        # Each row index is read as (peptide, method).
        for index, row in result.iterrows():
            p = index[0]
            method = index[1]
            proteins = ",".join(set([transcript_to_genes[prot.transcript_id.split(":FRED2")[0]]
                                     for prot in p.get_all_proteins()]))
            vars_str = ""

            if args.vcf is not None:
                # One "<transcript>:<variant,variant,...>" entry per protein
                # that has variants, joined by "|".
                vars_str = "\t"+"|".join(set(
                    prot_id.split(":FRED2")[0]+":"+",".join(
                        repr(v) for v in set(p.get_variants_by_protein(prot_id)))
                    for prot_id in p.proteins.keys()
                    if p.get_variants_by_protein(prot_id)))

            f.write(str(p)+"\t"+method+"\t"+"\t".join("%.3f" % row[a] for a in alleles)+"\t"+proteins+vars_str+"\n")

    if args.etk:
        with open(args.output.rsplit(".", 1)[0]+"_etk.tsv", "w") as g:
            alleles = result.columns
            g.write("Alleles:\t"+"\t".join(a.name for a in alleles)+"\n")
            for index, row in result.iterrows():
                p = index[0]
                proteins = " ".join(set([transcript_to_genes[prot.transcript_id.split(":FRED2")[0]]
                                         for prot in p.get_all_proteins()]))
                g.write(str(p)+"\t"+"\t".join("%.3f" % row[a] for a in alleles)+"\t"+proteins+"\n")
    return 0
def main():
    """Command-line entry point for epitope prediction.

    Reads either a FASTA file (peptides of the requested length are generated
    from its proteins) or a plain peptide list, predicts scores for all
    alleles of the allele file with the selected method, and writes a
    tab-separated table with one line per row of the prediction result.

    Returns:
        0 on success, -1 if the input type is not recognised.
    """
    # Specify CTD interface
    # Every CTD Model has to have at least a name and a version, plus any of
    # the optional attributes below them.
    # The description used to be the argparse tutorial text
    # ('Process some integers.'); it now names what the tool does.
    model = argparse.ArgumentParser(
        description='Commandline tool for epitope prediction')
    model.add_argument('-m', '--method', type=str,
                       choices=EpitopePredictorFactory.available_methods().keys(),
                       default="bimas",
                       help='The name of the prediction method')
    model.add_argument('-v', '--version', type=str, default="",
                       help='The version of the prediction method')
    model.add_argument('-i', '--input', type=str, required=True,
                       help='Path to the input file')
    model.add_argument('-t', '--type', choices=["fasta", "peptide"], type=str,
                       default="fasta",
                       help='The data type of the input (fasta, peptide list)')
    model.add_argument('-l', '--length', choices=range(8, 18), type=int,
                       default=9, help='The length of peptides')
    model.add_argument('-a', '--alleles', type=str, required=True,
                       help='Path to the allele file (one per line in new nomenclature)')
    model.add_argument('-op', '--options', type=str, default="",
                       help="Additional options that get directly past to the tool")
    model.add_argument('-o', '--output', type=str, required=True,
                       help='Path to the output file')
    args = model.parse_args()

    # fasta protein
    if args.type == "fasta":
        # Peek at the first line: if it contains "|", the identifier is taken
        # from position 1 of the header, otherwise from position 0.
        with open(args.input, 'r') as f:
            first_line = f.readline()
        sep_pos = 1 if "|" in first_line else 0
        proteins = read_fasta(args.input, in_type=Protein, id_position=sep_pos)
        peptides = generate_peptides_from_proteins(proteins, args.length)
    elif args.type == "peptide":
        peptides = read_lines(args.input, in_type=Peptide)
    else:
        sys.stderr.write('Input type not known\n')
        return -1

    # read in alleles
    alleles = read_lines(args.alleles, in_type=Allele)

    # Only forward a version when the user asked for one.
    factory_kwargs = {} if args.version == "" else {"version": args.version}
    predictor = EpitopePredictorFactory(args.method, **factory_kwargs)
    result = predictor.predict(peptides, alleles, options=args.options)

    # write to TSV columns sequence method allele-scores...,protein-id/transcript-id
    with_antigen_ids = args.type == "fasta"
    with open(args.output, "w") as f:
        header_suffix = "\tAntigen ID" if with_antigen_ids else ""
        # The result columns replace the allele list read from file.
        alleles = result.columns
        f.write("Sequence\tMethod\t" + "\t".join(a.name for a in alleles) +
                header_suffix + "\n")
        # Each row index is read as (peptide, method).
        for index, row in result.iterrows():
            p = index[0]
            method = index[1]
            if with_antigen_ids:
                proteins = "\t" + ",".join(prot.transcript_id
                                           for prot in p.get_all_proteins())
            else:
                proteins = ""
            f.write(str(p) + "\t" + method + "\t" +
                    "\t".join("%.3f" % row[a] for a in alleles) +
                    proteins + "\n")
    return 0
# No allele list is supplied to the effect prediction at the end of this
# section.
alleles = None
file_in = arguments["--input"]
file_out = arguments["--output"]

print("read fasta")
proteins = read_fasta(file_in, id_position=0, in_type=Protein)

# restrict to only top N proteins if provided
if arguments["--top_N"]:
    Nargs = int(arguments["--top_N"])
    N = min(Nargs, len(proteins))
    proteins = proteins[0:N]

# parse peptide/protein information from Peptide list and Protein list
print("setup peptide/protein information table")
peptides = generate_peptides_from_proteins(proteins, PEPTIDE_LENGTH)
# Materialise once: the peptides are traversed several times below.
peptides_list = list(peptides)
# Only the first protein entry of each peptide is recorded. The views are
# wrapped in list() because keys()/items() cannot be subscripted on Python 3.
proteins_list = [
    list(peptide.proteins.keys())[0] for peptide in peptides_list
]
peptides_str_list = [peptide.tostring() for peptide in peptides_list]
# First recorded position within the first protein entry of each peptide.
peptides_position_list = [
    list(peptide.proteinPos.items())[0][1][0] for peptide in peptides_list
]
dt_peptides = pd.DataFrame({
    "peptide": peptides_str_list,
    "peptide_position": peptides_position_list,
    "transcript_id": proteins_list
})

# predict the effect for each unique peptide
print("predict the effects")
res = fred2wrap.predict_peptide_effects(peptides_list, alleles=alleles)