Beispiel #1
0
def main():
    """Command-line entry point for neoepitope prediction.

    Peptides are generated either from the variants of an annotated VCF file
    (``--vcf``, optionally restricted to the gene IDs listed in
    ``--proteins``) or, when no VCF is given, from the protein sequences of
    the IDs listed in ``--proteins``. Predictions for the alleles in
    ``--alleles`` are written to ``--output`` as a tab-separated table; the
    hidden ``--etk`` switch writes a second table next to it.

    Returns:
        0 on success, -1 if neither input was given or no variant survived
        the requested filters.
    """
    model = argparse.ArgumentParser(
        description='Neoepitope prediction for TargetInsepctor.')

    model.add_argument(
        '-m',
        '--method',
        type=str,
        choices=EpitopePredictorFactory.available_methods().keys(),
        default="bimas",
        help='The name of the prediction method')

    model.add_argument('-v',
                       '--vcf',
                       type=str,
                       default=None,
                       help='Path to the vcf input file')

    model.add_argument(
        '-t',
        '--type',
        type=str,
        choices=["VEP", "ANNOVAR", "SNPEFF"],
        default="VEP",
        help=
        'Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)'
    )

    model.add_argument('-p',
                       '--proteins',
                       type=str,
                       default=None,
                       help='Path to the protein ID input file (in HGNC-ID)')

    model.add_argument('-l',
                       '--length',
                       choices=range(8, 18),
                       type=int,
                       default=9,
                       help='The length of peptides')

    model.add_argument(
        '-a',
        '--alleles',
        type=str,
        required=True,
        help='Path to the allele file (one per line in new nomenclature)')

    model.add_argument(
        '-r',
        '--reference',
        type=str,
        default='GRCh38',
        help='The reference genome used for varinat annotation and calling.')

    model.add_argument(
        '-fINDEL',
        '--filterINDEL',
        action="store_true",
        help='Filter insertions and deletions (including frameshifts)')

    model.add_argument('-fFS',
                       '--filterFSINDEL',
                       action="store_true",
                       help='Filter frameshift INDELs')

    model.add_argument('-fSNP',
                       '--filterSNP',
                       action="store_true",
                       help='Filter SNPs')

    model.add_argument('-o',
                       '--output',
                       type=str,
                       required=True,
                       help='Path to the output file')
    # Hidden switch: not listed in --help.
    model.add_argument('-etk',
                       '--etk',
                       action="store_true",
                       help=argparse.SUPPRESS)

    args = model.parse_args()

    # Reject a run without any input before the reference lookup and the
    # adapter construction, so the user sees this message first.
    if args.vcf is None and args.proteins is None:
        sys.stderr.write(
            "At least a vcf file or a protein id file has to be provided.\n")
        return -1

    martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()])
    # Maps transcript IDs to the gene label printed in the "Antigen ID" column.
    transcript_to_genes = {}

    # if vcf file is given: generate variants and filter them if HGNC IDs are given
    if args.vcf is not None:
        protein_ids = []
        if args.proteins is not None:
            with open(args.proteins, "r") as f:
                stripped = (raw.strip() for raw in f)
                protein_ids = [gene_id for gene_id in stripped if gene_id]

        if args.type == "VEP":
            variants = read_variant_effect_predictor(args.vcf,
                                                     gene_filter=protein_ids)
        elif args.type == "SNPEFF":
            # NOTE(review): no gene filter is passed on this path, so
            # --proteins has no effect for SNPEFF input -- confirm intended.
            variants = read_vcf(args.vcf)[0]
        else:
            variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)

        # Collect every variation type to drop, then filter once into a real
        # list: the emptiness check and the second loop over `variants`
        # below both need a materialised sequence, not a one-shot iterator.
        excluded_types = [VariationType.UNKNOWN]
        if args.filterSNP:
            excluded_types.append(VariationType.SNP)
        if args.filterINDEL:
            excluded_types.extend([
                VariationType.INS, VariationType.DEL, VariationType.FSDEL,
                VariationType.FSINS
            ])
        if args.filterFSINDEL:
            excluded_types.extend([VariationType.FSDEL, VariationType.FSINS])
        variants = [v for v in variants if v.type not in excluded_types]

        if not variants:
            sys.stderr.write(
                "No variants left after filtering. Please refine your filtering criteria.\n"
            )
            return -1

        # Keep only peptides that carry at least one variant.
        epitopes = [
            pep for pep in generate_peptides_from_variants(
                variants, int(args.length), martDB, EIdentifierTypes.ENSEMBL)
            if any(
                pep.get_variants_by_protein(tid)
                for tid in pep.proteins.keys())
        ]

        for v in variants:
            for trans_id, coding in v.coding.items():
                if coding.geneID is not None:
                    transcript_to_genes[trans_id] = coding.geneID
                else:
                    transcript_to_genes[trans_id] = 'None'

    # else: generate protein sequences from given HGNC IDs and then epitopes
    else:
        proteins = []
        with open(args.proteins, "r") as f:
            for raw in f:
                gene_id = raw.strip()
                if not gene_id:
                    # Blank lines are ignored, as in the VCF branch above.
                    continue
                ensembl_ids = martDB.get_ensembl_ids_from_id(
                    gene_id, type=EIdentifierTypes.HGNC)[0]
                protein_seq = martDB.get_product_sequence(
                    ensembl_ids[EAdapterFields.PROTID])
                if protein_seq is not None:
                    transcript_id = ensembl_ids[EAdapterFields.TRANSID]
                    transcript_to_genes[transcript_id] = gene_id
                    proteins.append(
                        Protein(protein_seq,
                                gene_id=gene_id,
                                transcript_id=transcript_id))
        epitopes = generate_peptides_from_proteins(proteins, int(args.length))

    # read in allele list
    alleles = read_lines(args.alleles, in_type=Allele)

    result = EpitopePredictorFactory(args.method).predict(epitopes,
                                                          alleles=alleles)

    with open(args.output, "w") as f:
        allele_columns = result.columns
        # The separator belongs to the optional column itself; otherwise the
        # header ends in a dangling tab when no VCF is given. The column
        # title keeps its historical leading space.
        var_column = "\t Variants" if args.vcf is not None else ""
        f.write("Sequence\tMethod\t" +
                "\t".join(a.name for a in allele_columns) + "\tAntigen ID" +
                var_column + "\n")
        for index, row in result.iterrows():
            p = index[0]
            method = index[1]
            # Transcript IDs may carry a ":FRED2..." suffix; only the part
            # before it is a key of transcript_to_genes.
            gene_names = ",".join(
                set([
                    transcript_to_genes[prot.transcript_id.split(":FRED2")[0]]
                    for prot in p.get_all_proteins()
                ]))
            vars_str = ""

            if args.vcf is not None:
                vars_str = "\t" + "|".join(
                    set(
                        prot_id.split(":FRED2")[0] + ":" + ",".join(
                            repr(v)
                            for v in set(p.get_variants_by_protein(prot_id)))
                        for prot_id in p.proteins.keys()
                        if p.get_variants_by_protein(prot_id)))

            f.write(
                str(p) + "\t" + method + "\t" +
                "\t".join("%.3f" % row[a] for a in allele_columns) + "\t" +
                gene_names + vars_str + "\n")

    if args.etk:
        # Second table next to the main output: "<output stem>_etk.tsv".
        with open(args.output.rsplit(".", 1)[0] + "_etk.tsv", "w") as g:
            allele_columns = result.columns
            g.write("Alleles:\t" +
                    "\t".join(a.name for a in allele_columns) + "\n")
            for index, row in result.iterrows():
                p = index[0]
                gene_names = " ".join(
                    set([
                        transcript_to_genes[prot.transcript_id.split(
                            ":FRED2")[0]] for prot in p.get_all_proteins()
                    ]))
                g.write(
                    str(p) + "\t" +
                    "\t".join("%.3f" % row[a] for a in allele_columns) +
                    "\t" + gene_names + "\n")
    return 0
    for _id, seq in SimpleFastaParser(file):
        # generate element:
        _id = _id.split(" ")[0]

        try:
            collect.add(Protein(seq.strip().upper(), transcript_id=_id))
        except TypeError:
            collect.add(Protein(seq.strip().upper()))
    return list(collect)


proteins = read_protein_fasta(args.input)

c = 0
for k in range(args.min_length, args.max_length + 1):
    peptides = generate_peptides_from_proteins(proteins, k)
    # get proteins and corresponding counts
    pd_peptides = pd.DataFrame([(str(pep), ','.join(
        [prot.transcript_id.split(' ')[0]
         for prot in pep.get_all_proteins()]), ','.join([
             str(len(pep.proteinPos[prot.transcript_id]))
             for prot in pep.get_all_proteins()
         ])) for pep in peptides],
                               columns=['sequence', 'protein_ids', 'counts'])
    # assign id
    pd_peptides = pd_peptides.assign(
        id=[str(c + id) for id in pd_peptides.index])
    c += len(pd_peptides['sequence'])

    if k == args.min_length:
        pd_peptides[['sequence', 'id', 'protein_ids',
Beispiel #3
0
def main():
    """Command-line entry point for TAP prediction.

    Reads either a FASTA file (from which peptides of ``--length`` are
    generated) or a plain peptide list, runs the selected TAP predictor and
    writes a tab-separated table to ``--output``.

    Returns:
        0 on success, -1 if the input type is not recognised.
    """
    model = argparse.ArgumentParser(
        description='Commandline tool for TAP prediction',
        )

    model.add_argument('-m',
        '--method',
        type=str,
        choices=TAPPredictorFactory.available_methods().keys(),
        default="svmtap",
        help='The name of the prediction method'
        )

    model.add_argument('-v',
        '--version',
        type=str,
        default="",
        help='The version of the prediction method'
        )

    model.add_argument('-i',
        '--input',
        type=str,
        required=True,
        help='Path to the input file'
        )

    model.add_argument('-t',
        '--type',
        choices=["fasta", "peptide"],
        type=str,
        default="fasta",
        help='The data type of the input (fasta, peptide list)'
        )

    model.add_argument('-l',
        '--length',
        type=int,
        default=9,
        help='The length of peptides'
        )

    model.add_argument('-op',
        '--options',
        type=str,
        default="",
        help="Additional options that get directly past to the tool"
    )

    model.add_argument('-o',
        '--output',
        type=str,
        required=True,
        help='Path to the output file'
        )

    args = model.parse_args()

    is_fasta = args.type == "fasta"

    if is_fasta:
        # Peek at the first line: if it contains '|', the ID is taken from
        # position 1 of the header, otherwise from position 0.
        with open(args.input, 'r') as f:
            first_line = f.readline()
        sep_pos = 1 if first_line.count("|") else 0
        proteins = read_fasta(args.input, in_type=Protein, id_position=sep_pos)
        peptides = generate_peptides_from_proteins(proteins, int(args.length))
    elif args.type == "peptide":
        peptides = read_lines(args.input, in_type=Peptide)
    else:
        # Defensive only: argparse `choices` already restricts --type.
        sys.stderr.write('Input type not known\n')
        return -1

    # Only pass a version when one was requested, so the factory's own
    # default applies otherwise.
    factory_kwargs = {"version": args.version} if args.version != "" else {}
    predictor = TAPPredictorFactory(args.method, **factory_kwargs)
    result = predictor.predict(peptides, options=args.options)

    # write to TSV columns sequence method score...,protein-id/transcript-id
    with open(args.output, "w") as f:
        protein_header = "\tProtein ID" if is_fasta else ""
        f.write("Sequence\tMethod\tScore" + protein_header + "\n")
        for index, row in result.iterrows():
            p = index
            scores = "\t".join("%s\t%.3f" % (method, score)
                               for method, score in row.iteritems())
            # The protein column (and its separator) exists only for FASTA
            # input, so peptide-list rows match the header instead of
            # ending in a stray tab.
            protein_column = ""
            if is_fasta:
                protein_column = "\t" + ",".join(
                    prot.transcript_id for prot in p.get_all_proteins())
            f.write(str(p) + "\t" + scores + protein_column + "\n")
    return 0
def _read_nonblank_lines(path):
    """Return the stripped, non-empty lines of the text file at *path*."""
    with open(path, "r") as handle:
        stripped = (line.strip() for line in handle)
        return [line for line in stripped if line]


def _gene_labels(peptide, transcript_to_genes):
    """Return the distinct gene labels of all proteins *peptide* occurs in.

    Transcript IDs may carry a ":FRED2..." suffix; only the part before it
    is used as key into *transcript_to_genes*.
    """
    return set(transcript_to_genes[prot.transcript_id.split(":FRED2")[0]]
               for prot in peptide.get_all_proteins())


def main():
    """Run neoepitope prediction from the command line.

    With ``--vcf`` the annotated variants are read (optionally restricted to
    the gene IDs in ``--proteins``), filtered by type and turned into
    peptides; without it, peptides are generated from the protein sequences
    of the IDs in ``--proteins``. Predictions for the alleles in
    ``--alleles`` go to ``--output`` as a tab-separated table; the hidden
    ``--etk`` switch additionally writes ``<output stem>_etk.tsv``.

    Returns:
        0 on success, -1 if neither input was given or every variant was
        filtered out.
    """
    model = argparse.ArgumentParser(description='Neoepitope prediction for TargetInsepctor.')

    model.add_argument(
        '-m','--method',
        type=str,
        choices=EpitopePredictorFactory.available_methods().keys(),
        default="bimas",
        help='The name of the prediction method'
        )

    model.add_argument(
        '-v', '--vcf',
        type=str,
        default=None,
        help='Path to the vcf input file'
        )

    model.add_argument(
        '-t', '--type',
        type=str,
        choices=["VEP", "ANNOVAR", "SNPEFF"],
        default="VEP",
        help='Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)'
        )

    model.add_argument(
        '-p','--proteins',
        type=str,
        default=None,
        help='Path to the protein ID input file (in HGNC-ID)'
        )

    model.add_argument(
        '-l','--length',
        choices=range(8, 18),
        type=int,
        default=9,
        help='The length of peptides'
        )

    model.add_argument(
        '-a','--alleles',
        type=str,
        required=True,
        help='Path to the allele file (one per line in new nomenclature)'
        )

    model.add_argument(
        '-r' ,'--reference',
        type=str,
        default='GRCh38',
        help='The reference genome used for varinat annotation and calling.'
        )

    model.add_argument(
        '-fINDEL' ,'--filterINDEL',
        action="store_true",
        help='Filter insertions and deletions (including frameshifts)'
        )

    model.add_argument(
        '-fFS' ,'--filterFSINDEL',
        action="store_true",
        help='Filter frameshift INDELs'
        )

    model.add_argument(
        '-fSNP' ,'--filterSNP',
        action="store_true",
        help='Filter SNPs'
        )

    model.add_argument(
        '-o','--output',
        type=str,
        required=True,
        help='Path to the output file'
        )
    # Hidden switch: suppressed from the --help output.
    model.add_argument(
        '-etk','--etk',
        action="store_true",
        help=argparse.SUPPRESS
        )

    args = model.parse_args()
    from_vcf = args.vcf is not None

    # Check the input selection first so a missing input is reported before
    # the reference lookup and adapter construction can fail.
    if not from_vcf and args.proteins is None:
        sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n")
        return -1

    martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()])
    # transcript ID -> gene label shown in the "Antigen ID" column
    transcript_to_genes = {}

    if from_vcf:
        # VCF mode: read variants, optionally restricted to the given gene IDs.
        protein_ids = []
        if args.proteins is not None:
            protein_ids = _read_nonblank_lines(args.proteins)

        if args.type == "VEP":
            variants = read_variant_effect_predictor(args.vcf, gene_filter=protein_ids)
        elif args.type == "SNPEFF":
            # NOTE(review): --proteins is not applied on this path -- confirm intended.
            variants = read_vcf(args.vcf)[0]
        else:
            variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)

        # One pass over the variants with the union of all unwanted types.
        # The result is a real list, which the emptiness test and the
        # second iteration further down both rely on.
        unwanted = [VariationType.UNKNOWN]
        if args.filterSNP:
            unwanted.append(VariationType.SNP)
        if args.filterINDEL:
            unwanted += [VariationType.INS, VariationType.DEL,
                         VariationType.FSDEL, VariationType.FSINS]
        if args.filterFSINDEL:
            unwanted += [VariationType.FSDEL, VariationType.FSINS]
        variants = [var for var in variants if var.type not in unwanted]

        if not variants:
            sys.stderr.write("No variants left after filtering. Please refine your filtering criteria.\n")
            return -1

        # Only peptides that actually contain a variant are of interest.
        candidates = generate_peptides_from_variants(variants, int(args.length),
                                                     martDB, EIdentifierTypes.ENSEMBL)
        epitopes = [pep for pep in candidates
                    if any(pep.get_variants_by_protein(tid) for tid in pep.proteins.keys())]

        for var in variants:
            for trans_id, coding in var.coding.items():
                gene = coding.geneID
                transcript_to_genes[trans_id] = gene if gene is not None else 'None'
    else:
        # Protein mode: fetch the sequence of every listed HGNC ID.
        proteins = []
        # Blank lines are dropped here exactly as in VCF mode.
        for gene_id in _read_nonblank_lines(args.proteins):
            ensembl_ids = martDB.get_ensembl_ids_from_id(gene_id, type=EIdentifierTypes.HGNC)[0]
            protein_seq = martDB.get_product_sequence(ensembl_ids[EAdapterFields.PROTID])
            if protein_seq is None:
                continue
            transcript_id = ensembl_ids[EAdapterFields.TRANSID]
            transcript_to_genes[transcript_id] = gene_id
            proteins.append(Protein(protein_seq, gene_id=gene_id, transcript_id=transcript_id))
        epitopes = generate_peptides_from_proteins(proteins, int(args.length))

    # read in allele list
    alleles = read_lines(args.alleles, in_type=Allele)

    result = EpitopePredictorFactory(args.method).predict(epitopes, alleles=alleles)
    allele_columns = result.columns

    with open(args.output, "w") as f:
        # The tab is part of the optional column, so the header has no
        # dangling separator without a VCF. The title keeps its historical
        # leading space.
        var_column = "\t Variants" if from_vcf else ""
        f.write("Sequence\tMethod\t"+"\t".join(a.name for a in allele_columns)+"\tAntigen ID"+var_column+"\n")
        for index, row in result.iterrows():
            p = index[0]
            method = index[1]
            gene_names = ",".join(_gene_labels(p, transcript_to_genes))
            vars_str = ""

            if from_vcf:
                vars_str = "\t"+"|".join(set(
                    prot_id.split(":FRED2")[0]+":"+",".join(
                        repr(v) for v in set(p.get_variants_by_protein(prot_id)))
                    for prot_id in p.proteins.keys()
                    if p.get_variants_by_protein(prot_id)))

            scores = "\t".join("%.3f"%row[a] for a in allele_columns)
            f.write(str(p)+"\t"+method+"\t"+scores+"\t"+gene_names+vars_str+"\n")

    if args.etk:
        with open(args.output.rsplit(".",1)[0]+"_etk.tsv", "w") as g:
            g.write("Alleles:\t"+"\t".join(a.name for a in allele_columns)+"\n")
            for index, row in result.iterrows():
                p = index[0]
                gene_names = " ".join(_gene_labels(p, transcript_to_genes))
                scores = "\t".join("%.3f"%row[a] for a in allele_columns)
                g.write(str(p)+"\t"+scores+"\t"+gene_names+"\n")
    return 0
def main():
    """Command-line entry point for epitope prediction.

    Reads either a FASTA file (from which peptides of ``--length`` are
    generated) or a plain peptide list, predicts scores for the alleles in
    ``--alleles`` with the selected method and writes a tab-separated table
    to ``--output``.

    Returns:
        0 on success, -1 if the input type is not recognised.
    """
    # The description previously was the placeholder text from the argparse
    # documentation; it now names what this tool does.
    model = argparse.ArgumentParser(description='Commandline tool for epitope prediction')

    model.add_argument('-m',
        '--method',
        type=str,
        choices=EpitopePredictorFactory.available_methods().keys(),
        default="bimas",
        help='The name of the prediction method'
        )

    model.add_argument('-v',
        '--version',
        type=str,
        default="",
        help='The version of the prediction method'
        )

    model.add_argument('-i',
        '--input',
        type=str,
        required=True,
        help='Path to the input file'
        )

    model.add_argument('-t',
        '--type',
        choices=["fasta","peptide"],
        type=str,
        default="fasta",
        help='The data type of the input (fasta, peptide list)'
        )

    model.add_argument('-l',
        '--length',
        choices=range(8, 18),
        type=int,
        default=9,
        help='The length of peptides'
        )

    model.add_argument('-a',
        '--alleles',
        type=str,
        required=True,
        help='Path to the allele file (one per line in new nomenclature)'
        )

    model.add_argument('-op',
        '--options',
        type=str,
        default="",
        help="Additional options that get directly past to the tool"
    )

    model.add_argument('-o',
        '--output',
        type=str,
        required=True,
        help='Path to the output file'
        )

    args = model.parse_args()

    is_fasta = args.type == "fasta"

    if is_fasta:
        # Peek at the first line: if it contains '|', the ID is taken from
        # position 1 of the header, otherwise from position 0.
        with open(args.input, 'r') as f:
            first_line = f.readline()
        sep_pos = 1 if first_line.count("|") else 0
        proteins = read_fasta(args.input, in_type=Protein, id_position=sep_pos)
        peptides = generate_peptides_from_proteins(proteins, args.length)
    elif args.type == "peptide":
        peptides = read_lines(args.input, in_type=Peptide)
    else:
        # Defensive only: argparse `choices` already restricts --type.
        sys.stderr.write('Input type not known\n')
        return -1

    # read in alleles
    alleles = read_lines(args.alleles, in_type=Allele)

    # Only pass a version when one was requested, so the factory's own
    # default applies otherwise.
    if args.version == "":
        predictor = EpitopePredictorFactory(args.method)
    else:
        predictor = EpitopePredictorFactory(args.method, version=args.version)
    result = predictor.predict(peptides, alleles, options=args.options)

    # write to TSV columns sequence method allele-scores...,protein-id/transcript-id
    with open(args.output, "w") as f:
        allele_columns = result.columns
        antigen_header = "\tAntigen ID" if is_fasta else ""
        f.write("Sequence\tMethod\t"+"\t".join(a.name for a in allele_columns)+antigen_header+"\n")
        for index, row in result.iterrows():
            p = index[0]
            method = index[1]
            # The antigen column exists only for FASTA input.
            antigen_column = ""
            if is_fasta:
                antigen_column = "\t"+",".join(prot.transcript_id for prot in p.get_all_proteins())
            scores = "\t".join("%.3f"%row[a] for a in allele_columns)
            f.write(str(p)+"\t"+method+"\t"+scores+antigen_column+"\n")

    return 0
Beispiel #6
0
        alleles = None
    file_in = arguments["--input"]
    file_out = arguments["--output"]

    print("read fasta")
    proteins = read_fasta(file_in, id_position=0, in_type=Protein)

    # restrict to only top N proteins if provided
    if arguments["--top_N"]:
        Nargs = int(arguments["--top_N"])
        N = min(Nargs, len(proteins))
        proteins = proteins[0:N]

    # parse peptide/protein information from Peptide list and Protein list
    print("setup peptide/protein information table")
    peptides = generate_peptides_from_proteins(proteins, PEPTIDE_LENGTH)
    peptides_list = [peptide for peptide in peptides]
    proteins_list = [peptide.proteins.keys()[0] for peptide in peptides_list]
    peptides_str_list = [peptide.tostring() for peptide in peptides_list]
    peptides_position_list = [
        peptide.proteinPos.items()[0][1][0] for peptide in peptides_list
    ]
    dt_peptides = pd.DataFrame({
        "peptide": peptides_str_list,
        "peptide_position": peptides_position_list,
        "transcript_id": proteins_list
    })

    # predict the effect for each unique peptide
    print("predict the effects")
    res = fred2wrap.predict_peptide_effects(peptides_list, alleles=alleles)