예제 #1
0
 def test_translate(self):
     """Check the first protein generated from ``self.w_v`` with ``to_stop=False``.

     The string form of the first item yielded by
     ``generate_proteins_from_transcripts`` must equal the expected sequence.
     """
     gcg_var = "A*NADEQSEWEREVICNKNSLFTDEKFILSA*YL*G*TELESI*KQCALVQKYRA*DTEHIKSSQRGLALSSPALFYSTLPEDSRNEKHLLCGWIICNAGTRQLATFPSRHRGEIQIILSFPGRPTQ*S*SDERGQAPFTGHIHQ*LQQVSGLQACPRFCAVVDEYQEEQE*HCQTSR*I*ETC*RDLYQ*CKFLFGRPSCQGIHCLAGERPRKARFPRRGRHC*RTWPQTC*WFFL**DEHHS**SCRQGLYKLVDSDQNH*QEITISLFKIIFTTSPASHVGCLKC*VL*I*EVYSEATLLCMPINKFSFSVV*PKITNGIKFYQNIAKISALKYESARFCYFLLILDEVPQPVYI*R*NYFSMI*FVNVNYSDLTYLHYNNRRIEELVATVVKLERELSS*NLCLKNTQLSMYQRYN*IKFSSFFTIV"
     first_protein = next(
         generate_proteins_from_transcripts(self.w_v, to_stop=False))
     # assertEqual reports both strings on failure; assertTrue(a == b) only
     # says "False is not true".
     self.assertEqual(str(first_protein), gcg_var)
예제 #2
0
def main():
    """Generate candidate (neo)epitopes and write them, optionally scored.

    Peptides are derived either from the variants of an annotated VCF file
    (``--vcf``; the IDs from ``--proteins`` are passed as ``gene_filter`` to
    the VEP/ANNOVAR readers) or, when no VCF is given, from the product
    sequences of the IDs listed in ``--proteins``.  With
    ``--predict_bindings`` the peptides are scored for the alleles read from
    ``--alleles``; otherwise only a peptide table and a plain peptide list
    are written.

    Returns:
        0 on success, -1 if neither input was given or no variants are left
        after filtering.
    """
    model = argparse.ArgumentParser(
        description='Neoepitope prediction for TargetInspector.')

    model.add_argument(
        '-m',
        '--method',
        type=str,
        choices=EpitopePredictorFactory.available_methods().keys(),
        default="bimas",
        help='The name of the prediction method')

    model.add_argument('-v',
                       '--vcf',
                       type=str,
                       default=None,
                       help='Path to the vcf input file')

    model.add_argument(
        '-t',
        '--type',
        type=str,
        choices=["VEP", "ANNOVAR", "SNPEFF"],
        default="VEP",
        help=
        'Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)'
    )

    model.add_argument('-p',
                       '--proteins',
                       type=str,
                       default=None,
                       help='Path to the protein ID input file (in HGNC-ID)')

    model.add_argument('-minl',
                       '--peptide_min_length',
                       type=int,
                       default=8,
                       help='Minimum peptide length for epitope prediction')

    model.add_argument('-maxl',
                       '--peptide_max_length',
                       type=int,
                       default=12,
                       help='Maximum peptide length for epitope prediction')

    model.add_argument(
        '-a',
        '--alleles',
        type=str,
        required=True,
        help='Path to the allele file (one per line in new nomenclature)')

    model.add_argument(
        '-r',
        '--reference',
        type=str,
        default='GRCh38',
        help='The reference genome used for variant annotation and calling.')

    model.add_argument(
        '-fINDEL',
        '--filterINDEL',
        action="store_true",
        help='Filter insertions and deletions (including frameshifts)')

    model.add_argument('-fFS',
                       '--filterFSINDEL',
                       action="store_true",
                       help='Filter frameshift INDELs')

    model.add_argument('-fSNP',
                       '--filterSNP',
                       action="store_true",
                       help='Filter SNPs')

    model.add_argument('-etk',
                       '--etk',
                       action="store_true",
                       help=argparse.SUPPRESS)

    model.add_argument('-bind',
                       '--predict_bindings',
                       action="store_true",
                       help='Predict bindings')

    model.add_argument('-o',
                       '--output',
                       type=str,
                       required=True,
                       help='Path to the output file')

    args = model.parse_args()

    martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()])
    transcript_to_genes = {}

    def gene_label(peptide, separator):
        """Join the distinct gene IDs of all proteins `peptide` stems from."""
        return separator.join(
            set(transcript_to_genes[prot.transcript_id.split(":FRED2")[0]]
                for prot in peptide.get_all_proteins()))

    def variant_column(peptide):
        """Build the tab-prefixed '<transcript>:<variants>' column of a row."""
        return "\t" + "|".join(
            set(prot_id.split(":FRED2")[0] + ":" + ",".join(
                repr(v) for v in set(peptide.get_variants_by_protein(prot_id)))
                for prot_id in peptide.proteins.keys()
                if peptide.get_variants_by_protein(prot_id)))

    if args.vcf is None and args.proteins is None:
        sys.stderr.write(
            "At least a vcf file or a protein id file has to be provided.\n")
        return -1

    # if vcf file is given: generate variants and filter them if HGNC IDs are given
    if args.vcf is not None:
        protein_ids = []
        if args.proteins is not None:
            with open(args.proteins, "r") as f:
                for line in f:
                    line = line.strip()
                    if line != "":
                        protein_ids.append(line)

        if args.type == "VEP":
            variants = read_variant_effect_predictor(args.vcf,
                                                     gene_filter=protein_ids)
        elif args.type == "SNPEFF":
            variants = read_vcf(args.vcf)[0]
        else:
            variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)

        # Collect every variation type that must be dropped, then filter once.
        excluded_types = [VariationType.UNKNOWN]
        if args.filterSNP:
            excluded_types.append(VariationType.SNP)
        if args.filterINDEL:
            excluded_types.extend([
                VariationType.INS, VariationType.DEL, VariationType.FSDEL,
                VariationType.FSINS
            ])
        if args.filterFSINDEL:
            excluded_types.extend([VariationType.FSDEL, VariationType.FSINS])

        # A real list (not a lazy filter object) is required here: it is
        # tested for emptiness and iterated twice below.
        variants = [v for v in variants if v.type not in excluded_types]

        if not variants:
            sys.stderr.write(
                "No variants left after filtering. Please refine your filtering criteria.\n"
            )
            return -1

        prots = list(
            generate_proteins_from_transcripts(
                generate_transcripts_from_variants(variants, martDB,
                                                   EIdentifierTypes.ENSEMBL)))

        epitopes = []
        for peplen in range(args.peptide_min_length,
                            args.peptide_max_length + 1):
            # keep only peptides that carry at least one variant
            epitopes.extend(
                pep for pep in generate_peptides_from_proteins(prots, peplen)
                if any(
                    pep.get_variants_by_protein(prot_id)
                    for prot_id in pep.proteins.keys()))

        for v in variants:
            for trans_id, coding in v.coding.items():
                if coding.geneID is not None:
                    transcript_to_genes[trans_id] = coding.geneID
                else:
                    transcript_to_genes[trans_id] = 'None'

    # else: generate protein sequences from given HGNC IDs and then epitopes
    else:
        proteins = []
        with open(args.proteins, "r") as f:
            for line in f:
                gene = line.strip()
                if not gene:
                    # blank lines carry no ID; skipped like in the vcf branch
                    continue
                ensembl_ids = martDB.get_ensembl_ids_from_id(
                    gene, type=EIdentifierTypes.HGNC)[0]
                protein_seq = martDB.get_product_sequence(
                    ensembl_ids[EAdapterFields.PROTID])
                if protein_seq is not None:
                    transcript_id = ensembl_ids[EAdapterFields.TRANSID]
                    transcript_to_genes[transcript_id] = gene
                    proteins.append(
                        Protein(protein_seq,
                                gene_id=gene,
                                transcript_id=transcript_id))

        epitopes = []
        # +1 so the maximum length is included, as in the vcf branch
        for length in range(args.peptide_min_length,
                            args.peptide_max_length + 1):
            epitopes.extend(generate_peptides_from_proteins(proteins, length))

    # read in allele list
    alleles = read_lines(args.alleles, in_type=Allele)

    var_column = " Variants" if args.vcf is not None else ""

    # predict bindings for all found neoepitopes
    if args.predict_bindings:
        result = EpitopePredictorFactory(args.method).predict(epitopes,
                                                              alleles=alleles)
        alleles = result.columns

        with open(args.output, "w") as f:
            f.write("Sequence\tMethod\t" + "\t".join(a.name for a in alleles) +
                    "\tAntigen ID\t" + var_column + "\n")
            for index, row in result.iterrows():
                p = index[0]
                method = index[1]
                vars_str = variant_column(p) if args.vcf is not None else ""
                f.write(
                    str(p) + "\t" + method + "\t" +
                    "\t".join("%.3f" % row[a] for a in alleles) + "\t" +
                    gene_label(p, ",") + vars_str + "\n")

        if args.etk:
            with open(args.output.rsplit(".", 1)[0] + "_etk.tsv", "w") as g:
                g.write("Alleles:\t" + "\t".join(a.name
                                                 for a in alleles) + "\n")
                for index, row in result.iterrows():
                    p = index[0]
                    g.write(
                        str(p) + "\t" + "\t".join("%.3f" % row[a]
                                                  for a in alleles) + "\t" +
                        gene_label(p, " ") + "\n")
    # don't predict bindings!
    # different output format!
    else:
        with open(args.output, "w") as f:
            f.write("Sequence\tAntigen ID\t" + var_column + "\n")
            for epitope in epitopes:
                vars_str = ""
                if args.vcf is not None:
                    vars_str = variant_column(epitope)
                f.write(
                    str(epitope) + "\t" + gene_label(epitope, ",") + vars_str +
                    "\n")

        plain_path = args.output.replace('.csv', '.txt')
        if plain_path == args.output:
            # The output name contains no '.csv': writing to the unchanged
            # path would overwrite the table written just above.
            plain_path = args.output + '.txt'
        with open(plain_path, "w") as f:
            for epitope in epitopes:
                f.write(str(epitope) + "\n")

    return 0
예제 #3
0
def main():
    """Write a FASTA file of the proteins generated from an annotated VCF.

    Variants are read with the reader selected by ``--type``, filtered
    according to the ``--filter*`` flags, turned into transcripts and then
    proteins, and each retained protein is written to ``--output`` as a
    header line followed by its sequence.

    Returns:
        0 on success, -1 if no VCF file was given or no variants are left
        after filtering.
    """
    model = argparse.ArgumentParser(
        description='Neoepitope protein fasta generation from variant vcf')

    model.add_argument('-v',
                       '--vcf',
                       type=str,
                       default=None,
                       help='Path to the vcf input file')

    # The value is upper-cased before the choices check, so the previously
    # offered spelling "SnpEff" is still accepted and now actually selects
    # the SnpEff branch below (it used to fall through to ANNOVAR).
    model.add_argument(
        '-t',
        '--type',
        type=str.upper,
        choices=["VEP", "ANNOVAR", "SNPEFF"],
        default="VEP",
        help=
        'Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)'
    )

    model.add_argument('-p',
                       '--proteins',
                       type=str,
                       default=None,
                       help='Path to the protein ID input file (in HGNC-ID)')

    model.add_argument(
        '-r',
        '--reference',
        type=str,
        default='GRCh38',
        help='The reference genome used for variant annotation and calling.')

    model.add_argument(
        '-fINDEL',
        '--filterINDEL',
        action="store_true",
        help='Filter insertions and deletions (including frameshifts)')

    model.add_argument('-fFS',
                       '--filterFSINDEL',
                       action="store_true",
                       help='Filter frameshift INDELs')

    model.add_argument('-fSNP',
                       '--filterSNP',
                       action="store_true",
                       help='Filter SNPs')

    model.add_argument('-o',
                       '--output',
                       type=str,
                       required=True,
                       help='Path to the output file')

    args = model.parse_args()

    martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()])

    if args.vcf is None:
        sys.stderr.write(
            "At least a vcf file or a protein id file has to be provided.\n")
        return -1

    # optional gene filter: one ID per non-blank line
    protein_ids = []
    if args.proteins is not None:
        with open(args.proteins, "r") as f:
            for line in f:
                line = line.strip()
                if line != "":
                    protein_ids.append(line)

    if args.type == "VEP":
        variants = read_variant_effect_predictor(args.vcf,
                                                 gene_filter=protein_ids)
    elif args.type == "SNPEFF":
        variants = read_vcf(args.vcf)[0]
    else:
        variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)

    # Collect the variation types to drop and filter once.  UNKNOWN is part
    # of the list so the emptiness check below sees the final variant set.
    excluded_types = [VariationType.UNKNOWN]
    if args.filterSNP:
        excluded_types.append(VariationType.SNP)
    if args.filterINDEL:
        excluded_types.extend([
            VariationType.INS, VariationType.DEL, VariationType.FSDEL,
            VariationType.FSINS
        ])
    if args.filterFSINDEL:
        excluded_types.extend([VariationType.FSDEL, VariationType.FSINS])

    # A list (not a lazy filter object) so that `not variants` is meaningful.
    variants = [v for v in variants if v.type not in excluded_types]

    if not variants:
        sys.stderr.write(
            "No variants left after filtering. Please refine your filtering criteria.\n"
        )
        return -1

    #generate transcripts
    transcripts = generate_transcripts_from_variants(
        variants, martDB, EIdentifierTypes.ENSEMBL)

    #generate proteins
    # NOTE(review): this predicate uses `.proteins` and
    # `get_variants_by_protein` on each generated protein -- confirm the
    # objects yielded by generate_proteins_from_transcripts expose that API.
    proteins = [
        prot for prot in generate_proteins_from_transcripts(transcripts)
        if any(
            prot.get_variants_by_protein(tid) for tid in prot.proteins.keys())
    ]

    #write fasta file
    with open(args.output, "w") as f:
        for p in proteins:
            f.write('>' + str(p.transcript_id) + '|' + str(p.vars) +
                    '_var_' + '\n')
            f.write(str(p) + '\n')

    return 0
예제 #4
0
 def test_translate(self):
     """Check the first protein generated from ``self.w_v`` with ``to_stop=False``.

     The string form of the first item yielded by
     ``generate_proteins_from_transcripts`` must equal the expected sequence.
     """
     gcg_var = "A*NADEQSEWEREVICNKNSLFTDEKFILSA*YL*G*TELESI*KQCALVQKYRA*DTEHIKSSQRGLALSSPALFYSTLPEDSRNEKHLLCGWIICNAGTRQLATFPSRHRGEIQIILSFPGRPTQ*S*SDERGQAPFTGHIHQ*LQQVSGLQACPRFCAVVDEYQEEQE*HCQTSR*I*ETC*RDLYQ*CKFLFGRPSCQGIHCLAGERPRKARFPRRGRHC*RTWPQTC*WFFL**DEHHS**SCRQGLYKLVDSDQNH*QEITISLFKIIFTTSPASHVGCLKC*VL*I*EVYSEATLLCMPINKFSFSVV*PKITNGIKFYQNIAKISALKYESARFCYFLLILDEVPQPVYI*R*NYFSMI*FVNVNYSDLTYLHYNNRRIEELVATVVKLERELSS*NLCLKNTQLSMYQRYN*IKFSSFFTIV"
     # next(gen) works on Python 2.6+ and Python 3; the generator method
     # `.next()` exists on Python 2 only.
     first_protein = next(
         generate_proteins_from_transcripts(self.w_v, to_stop=False))
     self.assertEqual(str(first_protein), gcg_var)
예제 #5
0
def main():
    """Write a FASTA file of all proteins generated from an annotated VCF.

    Variants are read with the reader selected by ``--type``, filtered
    according to the ``--filter*`` flags, turned into transcripts and then
    proteins; every protein is written to ``--output`` as a header line
    followed by its sequence.

    Returns:
        0 on success, -1 if no VCF file was given or no variants are left
        after filtering.
    """
    model = argparse.ArgumentParser(description='Neoepitope protein fasta generation from variant vcf')

    model.add_argument(
        '-v', '--vcf',
        type=str,
        default=None,
        help='Path to the vcf input file'
        )

    model.add_argument(
        '-t', '--type',
        type=str,
        choices=["VEP", "ANNOVAR", "SNPEFF"],
        default="VEP",
        help='Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)'
        )

    model.add_argument(
        '-p','--proteins',
        type=str,
        default=None,
        help='Path to the protein ID input file (in HGNC-ID)'
        )

    model.add_argument(
        '-r' ,'--reference',
        type=str,
        default='GRCh38',
        help='The reference genome used for variant annotation and calling.'
        )

    model.add_argument(
        '-fINDEL' ,'--filterINDEL',
        action="store_true",
        help='Filter insertions and deletions (including frameshifts)'
        )

    model.add_argument(
        '-fFS' ,'--filterFSINDEL',
        action="store_true",
        help='Filter frameshift INDELs'
        )

    model.add_argument(
        '-fSNP' ,'--filterSNP',
        action="store_true",
        help='Filter SNPs'
        )

    model.add_argument(
        '-o','--output',
        type=str,
        required=True,
        help='Path to the output file'
        )

    args = model.parse_args()

    martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()])

    # Guard clause: everything below needs the vcf file.
    if args.vcf is None:
        sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n")
        return -1

    # optional gene filter: one ID per non-blank line
    protein_ids = []
    if args.proteins is not None:
        with open(args.proteins, "r") as f:
            for line in f:
                line = line.strip()
                if line != "":
                    protein_ids.append(line)

    if args.type == "VEP":
        variants = read_variant_effect_predictor(args.vcf, gene_filter=protein_ids)
    elif args.type == "SNPEFF":
        variants = read_vcf(args.vcf)[0]
    else:
        variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)

    # Variation types to drop.  UNKNOWN is always removed and is included
    # here so the emptiness check below sees the final variant set.
    dropped_types = [VariationType.UNKNOWN]
    if args.filterSNP:
        dropped_types.append(VariationType.SNP)
    if args.filterINDEL:
        dropped_types += [VariationType.INS,
                          VariationType.DEL,
                          VariationType.FSDEL,
                          VariationType.FSINS]
    if args.filterFSINDEL:
        dropped_types += [VariationType.FSDEL, VariationType.FSINS]

    # Build a list: a lazy filter object would always be truthy, which
    # would silently disable the check below.
    variants = [v for v in variants if v.type not in dropped_types]

    if not variants:
        sys.stderr.write("No variants left after filtering. Please refine your filtering criteria.\n")
        return -1

    #generate transcripts
    transcripts = generate_transcripts_from_variants(variants, martDB, EIdentifierTypes.ENSEMBL)

    #generate proteins
    proteins = generate_proteins_from_transcripts(transcripts)

    #write fasta file
    with open(args.output, "w") as f:
        for p in proteins:
            f.write('>' + str(p.transcript_id) + '|' + str(p.vars) + '_var_' + '\n')
            f.write(str(p) + '\n')

    return 0