Example 1
    def test_simple_incorporation(self):
        """
        Test simple variant incorporation: only one variant per transcript.
        Input reference transcript: AAAAACCCCCGGGGG

        variant 3: insert TT after pos 7

        variant 1: SNP C -> T at pos 2

        variant 4: del CCCCC after pos 9
        """
        dummy_db = DummyAdapter()

        # INSERTIONS:
        dummy_vars = [var_3]
        trans = Generator.generate_transcripts_from_variants(
            dummy_vars, dummy_db, EIdentifierTypes.REFSEQ).next()
        self.assertEqual(str(trans), "AAAAACCTTCCCGGGGG")

        # SNPs:
        dummy_vars = [var_1]
        trans = Generator.generate_transcripts_from_variants(
            dummy_vars, dummy_db, EIdentifierTypes.REFSEQ).next()
        self.assertEqual(str(trans), "ATAAACCCCCGGGGG")

        # DELETIONS:
        dummy_vars = [var_4]
        trans = Generator.generate_transcripts_from_variants(
            dummy_vars, dummy_db, EIdentifierTypes.REFSEQ).next()
        self.assertEqual(str(trans), "AAAAAGGGGG")
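The insertion case above boils down to plain string slicing. A minimal sketch (plain Python, not Fred2's Generator code; a 0-based slice offset is assumed) that reproduces the asserted transcript:

# insert "TT" at 0-based offset 7 of the reference transcript
ref = "AAAAACCCCCGGGGG"
mutated = ref[:7] + "TT" + ref[7:]
assert mutated == "AAAAACCTTCCCGGGGG"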
Example 2
    def test_peptides_from_variants(self):
        coding = {}
        coding['NM_080751'] = MutationSyntax('NM_080751',2629,876,'c.2630C>T','p.Pro877Leu')
        var = Variant('line0',0,20,2621905,'C','T',coding,True,False)
        var.gene = 'TMC2'
        ma = MartsAdapter(biomart="http://ensembl.org")

        vars = [var, Variant("testInsertion", 2, 20, 2621899, "", "AAAAAA", {'NM_080751':MutationSyntax('NM_080751',2625,876,'c.2630C>T','p.Pro877Leu')}, True, False)]

        test = Generator.generate_peptides_from_variants(vars, 9, ma, id_type=EIdentifierTypes.REFSEQ, peptides=None)
        test2 = [x for x in test]
        print(len(test2))

        ts = list()
        # using a tweaked generator that takes another sequence source if the sequence is too short with respect to the given variants
        # in this case a newer/older sequence from BioMart than the one used as reference in the annotation process
        t = Generator.generate_transcripts_from_variants(vars, ma, id_type=EIdentifierTypes.REFSEQ)
        ts = [x for x in t]
        print(len(ts[0]))
        p = Generator.generate_proteins_from_transcripts(ts, to_stop=True)
        ps = [x for x in p]
        e = Generator.generate_peptides_from_proteins(ps, 9)
        es = [x for x in e]
        print(len(es))

        # print(vars)
        print(len(vars))
Example 3
    def test_simple_incorporation(self):
        """
        Test simple variant incorporation: only one variant per transcript.
        Input reference transcript: AAAAACCCCCGGGGG

        variant 3: insert TT after pos 7

        variant 1: SNP C -> T at pos 2

        variant 4: del CCCCC after pos 9
        """
        dummy_db = DummyAdapter()

        # INSERTIONS:
        dummy_vars = [var_3]
        trans = Generator.generate_transcripts_from_variants(dummy_vars, dummy_db).next()
        self.assertEqual(str(trans), "AAAAACCTTCCCGGGGG")

        # SNPs:
        dummy_vars = [var_1]
        trans = Generator.generate_transcripts_from_variants(dummy_vars, dummy_db).next()
        self.assertEqual(str(trans), "ATAAACCCCCGGGGG")

        # DELETIONS:
        dummy_vars = [var_4]
        trans = Generator.generate_transcripts_from_variants(dummy_vars, dummy_db).next()
        self.assertEqual(str(trans), "AAAAAGGGGG")
Example 4
    def test_real_life_test(self):
        mart = MartsAdapter(biomart="http://grch37.ensembl.org/biomart/martservice?query=")

        ano_path = os.path.join(os.path.dirname(inspect.getfile(Fred2)), "Data/examples/test_annovar.out")
        vars = read_annovar_exonic(ano_path)

        peps = set(map(lambda x: str(x), Generator.generate_peptides_from_variants(vars, 9, mart,
                                                                                   EIdentifierTypes.REFSEQ)))

        peps_from_prot = set(map(str, Generator.generate_peptides_from_proteins(
            Generator.generate_proteins_from_transcripts(
            Generator.generate_transcripts_from_variants(vars, mart, EIdentifierTypes.REFSEQ)), 9)))

        self.assertTrue(len(peps - peps_from_prot) == 0)
        self.assertTrue(len(peps_from_prot - peps) == 0)
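The two assertions at the end encode set equality as two empty set differences. The same check spelled out on plain Python sets, for illustration only:

a, b = {"GFK", "PPK"}, {"PPK", "GFK"}
assert len(a - b) == 0 and len(b - a) == 0  # equivalent to: assert a == b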
Example 5
    def test_proteins_from_variants(self):
        """
        Variants:
        3-DEL(-2)  , 5-INS(+3)  , 7-DEL(-4)
        HET-DEL(-2), HOM-INS(+3), HET-DEL(-1)

        Reference sequence:
        AAAAACCCCCGGGGG
        AAATTTCGGGGG (DEL,INS,DEL)
        AAATTTCCCCCGGGGG (DEL,INS)
        AAAAATTTCCGGGG (INS,DEL)
        AAAAATTTCCCCCGGGGG (INS)

        GGGGGCCCCCAAAAA
        GGGTTTCAAAAA (DEL,INS,DEL)
        GGGTTTCCCCCAAAAA (DEL,INS)
        GGGGGTTTCAAAAA (INS,DEL)
        GGGGGTTTCCCCCAAAAA (INS)


        Resulting protein sequences:
        KFG
        KNLG
        KFPPG
        KNFPRG

        GFK
        GGLK
        GFPPK
        GGFPQK
        """
        dummy_db = DummyAdapter()
        dummy_vars = [var_10, var_11, var_12]
        exp_prot = set([
            'KFG', 'KNLG', 'KFPPG', 'KNFPRG', 'GFK', 'GGLK', 'GFPPK', 'GGFPQK'
        ])
        prot = set(
            map(
                lambda x: str(x),
                Generator.generate_proteins_from_transcripts(
                    Generator.generate_transcripts_from_variants(
                        dummy_vars, dummy_db, EIdentifierTypes.REFSEQ))))
        self.assertTrue(len(prot - exp_prot) == 0)
        self.assertTrue(len(exp_prot - prot) == 0)
Example 6
    def test_offset_single(self):
        """
        tests if offset is correctly handled when several variants for one
        transcript occur. still only one transcript with one transcript variant.
        reference transcript: AAAAACCCCCGGGGG

        Each variant so that it is clearly down stream of
        it's predecessor

        """
        dummy_db = DummyAdapter()

        # 1) INS, SNP, DEL
        dummy_vars = [var_3, var_7, var_6]
        trans = Generator.generate_transcripts_from_variants(dummy_vars, dummy_db).next()

        self.assertEqual(str(trans), "AAAAACCTTCTGGGG")

        # 2.) INS, DEL, INS
        dummy_vars = [var_9, var_4, var_8]
        trans = Generator.generate_transcripts_from_variants(dummy_vars, dummy_db).next()
        self.assertEqual(str(trans), "AATTAAAGGGGGTTT")
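The offset handling the docstring describes can be sketched with a hypothetical helper (plain Python, not Fred2's implementation): edits are applied left to right, and every insertion or deletion shifts the coordinates of all downstream edits. The toy edits below are made up and are not var_3/var_7/var_6.

def apply_edits(seq, edits):
    """edits: list of (zero_based_pos, ref_allele, alt_allele), sorted by position."""
    offset = 0
    s = list(seq)
    for pos, ref, alt in edits:
        p = pos + offset
        s[p:p + len(ref)] = list(alt)          # replace, insert or delete in place
        offset += len(alt) - len(ref)          # shift everything downstream
    return "".join(s)

# a SNP, an insertion and a deletion on the reference transcript
print(apply_edits("AAAAACCCCCGGGGG", [(1, "A", "T"), (7, "", "TT"), (8, "CC", "")]))  # -> ATAAACCTTCGGGGG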
Example 7
    def test_offset_single(self):
        """
        tests if offset is correctly handled when several variants for one
        transcript occur. still only one transcript with one transcript variant.
        reference transcript: AAAAACCCCCGGGGG

        Each variant so that it is clearly down stream of
        it's predecessor

        """
        dummy_db = DummyAdapter()

        # 1) INS, SNP, DEL
        dummy_vars = [var_3, var_7, var_6]
        trans = Generator.generate_transcripts_from_variants(dummy_vars, dummy_db, EIdentifierTypes.REFSEQ).next()

        self.assertEqual(str(trans), "AAAAACCTTCTGGGG")

        # 2.) INS, DEL, INS
        dummy_vars = [var_9, var_4, var_8]
        trans = Generator.generate_transcripts_from_variants(dummy_vars, dummy_db, EIdentifierTypes.REFSEQ).next()
        self.assertEqual(str(trans), "AATTAAAGGGGGTTT")
Example 8
    def test_proteins_from_variants(self):
        """
        Variants:
        3-DEL(-2)  , 5-INS(+3)  , 7-DEL(-4)
        HET-DEL(-2), HOM-INS(+3), HET-DEL(-1)

        Reference sequence:
        AAAAACCCCCGGGGG
        AAATTTCGGGGG (DEL,INS,DEL)
        AAATTTCCCCCGGGGG (DEL,INS)
        AAAAATTTCCGGGG (INS,DEL)
        AAAAATTTCCCCCGGGGG (INS)

        GGGGGCCCCCAAAAA
        GGGTTTCAAAAA (DEL,INS,DEL)
        GGGTTTCCCCCAAAAA (DEL,INS)
        GGGGGTTTCAAAAA (INS,DEL)
        GGGGGTTTCCCCCAAAAA (INS)


        Resulting protein sequences:
        KFG
        KNLG
        KFPPG
        KNFPRG

        GFK
        GGLK
        GFPPK
        GGFPQK
        """
        dummy_db = DummyAdapter()
        dummy_vars = [var_10, var_11, var_12]
        exp_prot = set(['KFG', 'KNLG', 'KFPPG', 'KNFPRG', 'GFK', 'GGLK', 'GFPPK', 'GGFPQK'])
        prot = set(map(lambda x: str(x),
                       Generator.generate_proteins_from_transcripts(
                           Generator.generate_transcripts_from_variants(dummy_vars, dummy_db, EIdentifierTypes.REFSEQ)))
                   )
        self.assertTrue(len(prot-exp_prot) == 0)
        self.assertTrue(len(exp_prot-prot) == 0)
Example 9
    def test_non_syn_hetero_snp_trans_number(self):
        """
        tests if the number of generated transcripts for a heterozygous
        transcript is correct

        1 hetero vars = 2 transcripts
        :return:
        """
        vars_ = \
            [self.non_syn_hetero_snp, self.non_frame_shift_del, self.syn_homo_snp]

        trans = \
            [t for t in Generator.generate_transcripts_from_variants(vars_, self.db_adapter, EIdentifierTypes.REFSEQ)]

        self.assertTrue(len(trans) == 2**sum(not v.isHomozygous for v in vars_))
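The assertion encodes the underlying combinatorics: each heterozygous variant can either be applied or not, so n heterozygous variants yield 2**n transcript variants. A standalone sketch of that count (the three flags below are made up, not the fixtures used in the test):

variant_is_homozygous = [False, True, True]          # one heterozygous, two homozygous variants
n_hetero = sum(not hom for hom in variant_is_homozygous)
assert 2 ** n_hetero == 2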
Example 10
    def test_non_syn_hetero_snp_trans_number(self):
        """
        tests if the number of generated transcripts for a heterozygous
        transcript is correct

        1 hetero vars = 2 transcripts
        :return:
        """
        vars_ = \
            [self.non_syn_hetero_snp, self.non_frame_shift_del, self.syn_homo_snp]

        trans = \
            [t for t in Generator.generate_transcripts_from_variants(vars_, self.db_adapter)]

        self.assertTrue(len(trans) == 2**sum(not v.isHomozygous for v in vars_))
Example 11
    def test_real_life_test(self):
        mart = MartsAdapter(
            biomart="http://grch37.ensembl.org/biomart/martservice?query=")

        ano_path = os.path.join(os.path.dirname(inspect.getfile(Fred2)),
                                "Data/examples/test_annovar.out")
        vars = read_annovar_exonic(ano_path)

        peps = set(
            map(
                lambda x: str(x),
                Generator.generate_peptides_from_variants(
                    vars, 9, mart, EIdentifierTypes.REFSEQ)))

        peps_from_prot = set(
            map(
                str,
                Generator.generate_peptides_from_proteins(
                    Generator.generate_proteins_from_transcripts(
                        Generator.generate_transcripts_from_variants(
                            vars, mart, EIdentifierTypes.REFSEQ)), 9)))

        self.assertTrue(len(peps - peps_from_prot) == 0)
        self.assertTrue(len(peps_from_prot - peps) == 0)
Example 12
    def test_heterozygous_variants(self):
        """
        Create multiple transcript variants for a transcript, given a set
        containing heterozygous variants.

        Variants:
        3-DEL(-2)  , 5-INS(+3)  , 7-DEL(-4)
        HET-DEL(-2), HOM-INS(+3), HET-DEL(-1)

        Reference sequence:
        AAAAACCCCCGGGGG
        AAATTTCGGGGG (DEL,INS,DEL)
        AAATTTCCCCCGGGGG (DEL,INS)
        AAAAATTTCCGGGG (INS,DEL)
        AAAAATTTCCCCCGGGGG (INS)

        GGGGGCCCCCAAAAA
        GGGTTTCAAAAA (DEL,INS,DEL)
        GGGTTTCCCCCAAAAA (DEL,INS)
        GGGGGTTTCAAAAA (INS,DEL)
        GGGGGTTTCCCCCAAAAA (INS)
        """

        dummy_db = DummyAdapter()

        # 1) INS, SNP, DEL
        dummy_vars = [var_10, var_11, var_12]
        trans_gener = Generator.generate_transcripts_from_variants(
            dummy_vars, dummy_db, EIdentifierTypes.REFSEQ)
        trans = [t for t in trans_gener]

        trans = map(str, trans)

        self.assertEqual(len(trans), 8)

        self.assertTrue("AAATTTGGGGG" in trans)
        self.assertTrue("AAAAATTTGGGGG" in trans)
        self.assertTrue("AAATTTCCCCCGGGGG" in trans)
        self.assertTrue("AAAAATTTCCCCCGGGGG" in trans)

        self.assertTrue("GGGTTTAAAAA" in trans)
        self.assertTrue("GGGGGTTTAAAAA" in trans)
        self.assertTrue("GGGTTTCCCCCAAAAA" in trans)
        self.assertTrue("GGGGGTTTCCCCCAAAAA" in trans)
Example 13
    def test_heterozygous_variants(self):
        """
        Create multiple transcript variants for a transcript, given a set
        containing heterozygous variants.

        Variants:
        3-DEL(-2)  , 5-INS(+3)  , 7-DEL(-4)
        HET-DEL(-2), HOM-INS(+3), HET-DEL(-1)

        Reference sequence:
        AAAAACCCCCGGGGG
        AAATTTCGGGGG (DEL,INS,DEL)
        AAATTTCCCCCGGGGG (DEL,INS)
        AAAAATTTCCGGGG (INS,DEL)
        AAAAATTTCCCCCGGGGG (INS)

        GGGGGCCCCCAAAAA
        GGGTTTCAAAAA (DEL,INS,DEL)
        GGGTTTCCCCCAAAAA (DEL,INS)
        GGGGGTTTCAAAAA (INS,DEL)
        GGGGGTTTCCCCCAAAAA (INS)
        """

        dummy_db = DummyAdapter()

        # 1) INS, SNP, DEL
        dummy_vars = [var_10, var_11, var_12]
        trans_gener = Generator.generate_transcripts_from_variants(dummy_vars, dummy_db)
        trans = [t for t in trans_gener]

        trans = map(str, trans)

        self.assertEqual(len(trans), 8)

        self.assertTrue("AAATTTGGGGG" in trans)
        self.assertTrue("AAAAATTTGGGGG" in trans)
        self.assertTrue("AAATTTCCCCCGGGGG" in trans)
        self.assertTrue("AAAAATTTCCCCCGGGGG" in trans)

        self.assertTrue("GGGTTTAAAAA" in trans)
        self.assertTrue("GGGGGTTTAAAAA" in trans)
        self.assertTrue("GGGTTTCCCCCAAAAA" in trans)
        self.assertTrue("GGGGGTTTCCCCCAAAAA" in trans)
Example 14
def __main__():
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-V',
                        '--variations',
                        dest="var_file",
                        help='<Required> full path to the input variations',
                        required=True)
    parser.add_argument('-o',
                        "--outfile",
                        dest="outfile_path",
                        help="Created fasta file",
                        required=True)
    parser.add_argument(
        '-d',
        "--digest",
        dest="digest",
        type=int,
        help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a',
                        "--alleles",
                        dest="alleles",
                        help="Input alleles for prediction")
    parser.add_argument(
        '-p',
        "--predict",
        dest="predict_with",
        help="Method of prediction, needs alleles & length, allowed:[{m}]".
        format(m=PRED_METH))
    parser.add_argument(
        '-f',
        "--filter",
        dest="filter",
        type=float,
        help=
        "Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict"
    )
    parser.add_argument('-P',
                        "--Proteins",
                        dest="only_proteins",
                        action='store_true',
                        help="Will write only proteins.")
    parser.add_argument(
        '-b',
        "--base",
        dest="basefasta_path",
        help="If given, entries are replaced by the variation.")

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if options.filter and not options.predict_with:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    if options.predict_with and not options.alleles:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    temp_dir = "/tmp/"

    logging.basicConfig(
        filename=os.path.splitext(options.outfile_path)[0] +
        "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
        filemode='w+',
        level=logging.DEBUG)  #, format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path +
                 " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    #... look at theos filter, ligandoqc, fasta-distributions, lica and the morgenstellen server content scripts
    # complete proteins?
    # only containing binders?
    # k-mers?
    # binders only?
    # FastaSlicer.py?
    # remove original if homozygous (needs fasta input)?
    # add germline variant option? or expect all to be in one vcf?

    # MyObject = type('MyObject', (object,), {})
    # options = MyObject()
    # setattr(options,"var_file","/home/walzer/immuno-tools/Fred2/Fred2/Data/examples/vcftestfile3.vcf")
    #
    # vt = os.path.splitext(options.var_file)[-1]
    # if ".vcf" == vt:
    #     vcfvars, accessions = FileReader.read_vcf(options.var_file)
    #
    # mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")
    #
    # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    # transcripts = [x for x in transcript_gen if x.vars]
    # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    # protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    # proteins = [x for x in protein_gen if x.vars]
    # for p in proteins:
    #     p.gene_id = p.vars.values()[0][0].gene
    #
    #
    # for t in transcripts:
    #     t.gene_id = t.vars.values()[0].gene
    #

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    elif ".GSvar" == vt:
        pass
        # vcfvars = FileReader.read_GSvar(options.var_file)
    else:
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print m
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org"
                           )  # TODO guess id_type for mart_db from accessions

    transcript_gen = g.generate_transcripts_from_variants(
        vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)

    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        p.gene_id = p.vars.values(
        )[0][0].gene  # assume gene name from first variant

    proteins = [p for p in proteins
                if not is_stop_gain(p)]  # kick out stop gains

    # First exit option
    if not (options.predict_with or options.filter) and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print "N/A"
            sys.exit(0)
        else:
            e = proteins_to_fasta(proteins)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # From now on, digestion must be set somehow
    if not options.digest:
        digest = 9
    else:
        digest = options.digest
    peptide_gen = g.generate_peptides_from_proteins(proteins, digest)
    peptides = [x for x in peptide_gen]
    peptides_var = [
        x for x in peptides if any(
            x.get_variants_by_protein(y) for y in x.proteins.keys())
    ]  # removing unvaried

    # Second exit option
    if not (options.predict_with or options.filter):
        e = peptides_to_fasta(peptides_var)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)

    # From now on, predictions are needed
    try:
        target_alleles_set = set(
            FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(
            f=options.alleles)
        logging.error(m)
        print m, "what:", str(e)
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception as e:
        m = "Could not initialize prediction method {f}, aborting.".format(
            f=options.predict_with)
        logging.error(m)
        print m
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        print "something went wrong with the prediction", options.inf, options.predict_with, "what:", str(
            e)
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    #PRED_METH = set()
    for i, row in preds.iterrows():
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))
            #PRED_METH.add(j)  # need that later

    # Third exit option
    if not options.filter:
        if options.only_proteins:
            if options.basefasta_path:
                # TODO - replace from base fasta plus prediction annotation
                print "N/A"
                sys.exit(0)
            else:
                prs = annotate_protein_from_peptides(preds)
                e = proteins_to_fasta(prs)
                with open(options.outfile_path, 'w') as f:
                    f.write(e)
                sys.exit(0)
        else:
            e = peptides_to_fasta(preds)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # kick out nonbinder
    preds_f = preds[(preds > options.filter).any(axis=1)]

    # Fourth exit option
    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta binders only plus prediction annotation
            print "N/A"
            sys.exit(0)
        else:
            prs = annotate_protein_from_peptides(preds_f)
            e = proteins_to_fasta(prs)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)
    else:
        e = peptides_to_fasta(preds_f)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)
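The "kick out nonbinder" step above keeps any peptide row with at least one prediction above the threshold. The same pandas filter on a toy frame (allele names and values here are made up):

import pandas as pd

preds = pd.DataFrame({"A*02:01": [0.1, 0.7], "B*07:02": [0.2, 0.3]})
threshold = 0.5
print(preds[(preds > threshold).any(axis=1)])  # keeps only the second row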
Example 15
 def test__check_for_problematic_variants(self):
     self.assertTrue(
         Generator._check_for_problematic_variants([var_2, var_1]))
     self.assertFalse(
         Generator._check_for_problematic_variants([var_5, var_6]))
Example 16
 def test__incorp_deletion(self):
     ts = list("TESTSEQUEASDFGNCES")
     self.assertEqual(Generator._incorp_deletion(ts, var_4, "tsc_1", 0, 0),
                      -5)
     self.assertEqual(Generator._incorp_deletion(ts, var_6, "tsc_1", 0, 0),
                      -2)
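The asserted return values read as the coordinate shifts the edits introduce: var_4 deletes five bases ("CCCCC" in the earlier docstrings), hence -5, and the "TT" insertion in test__incorp_insertion adds two bases, hence +2. A sketch of that length arithmetic (an interpretation of the assertions, not taken from Fred2's documentation):

def offset_delta(ref_allele, alt_allele):
    return len(alt_allele) - len(ref_allele)

assert offset_delta("CCCCC", "") == -5   # deletion of five bases
assert offset_delta("", "TT") == 2       # insertion of two bases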
Example 17
 def test__incorp_insertion(self):
     ts = list("TESTSEQUENCE")
     self.assertEqual(Generator._incorp_insertion(ts, var_3, "tsc_1", 0, 0),
                      2)
Example 18
 def test__incorp_snp(self):
     ts = list("TESTSEQUENCE")
     self.assertEqual(Generator._incorp_snp(ts, var_2, "tsc_1", 6, 6), 6)
Example 19
 def test__incorp_deletion(self):
     ts = list("TESTSEQUEASDFGNCES")
     self.assertEqual(Generator._incorp_deletion(ts, var_4, "tsc_1", 0, 0), -5)
     self.assertEqual(Generator._incorp_deletion(ts, var_6, "tsc_1", 0, 0), -2)
Example 20
    def test_peptides_from_variants(self):
        """
        Create multiple peptides, given a set
        containing heterozygous variants.

        Variants:
        3-DEL(-2)  , 5-INS(+3)  , 7-DEL(-4)
        HET-DEL(-2), HOM-INS(+3), HET-DEL(-1)

        Reference sequence:
        AAAAACCCCCGGGGG
        AAATTTGGGGG (DEL,INS,DEL)
        AAATTTCCCCCGGGGG (DEL,INS)
        AAAAATTTGGGGG (INS,DEL)
        AAAAATTTCCCCCGGGGG (INS)

        GGGGGCCCCCAAAAA
        GGGTTTCAAAAA (DEL,INS,DEL)
        GGGTTTCCCCCAAAAA (DEL,INS)
        GGGGGTTTCAAAAA (INS,DEL)
        GGGGGTTTCCCCCAAAAA (INS)


        Resulting protein sequences:
        KFG
        KNLG
        KFPPG
        KNFPRG

        GFK
        GGLK
        GFPPK
        GGFPQK

        Resulting peptides of length 3:
        KFG +
        KNL +
        NLG +
        KFP +
        FPP +
        PPG +
        KNF +
        NFP +
        FPR +
        PRG +

        GFK +
        GGL +
        GLK +
        GFP +
        FPP +
        PPK +
        GGF +
        GFP +
        FPQ +
        PQK +
        """
        dummy_db = DummyAdapter()

        exp_peps = set([
            'PRG', 'GLK', 'PPG', 'KFP', 'GFK', 'PPK', 'GFP', 'PQK', 'KNL',
            'KFG', 'GGF', 'FPQ', 'FPP', 'NLG', 'FPR', 'KNF', 'GGL', 'NFP'
        ])
        # 1) INS, SNP, DEL
        dummy_vars = [var_10, var_11, var_12]
        peps = set(
            map(
                lambda x: str(x),
                Generator.generate_peptides_from_variants(
                    dummy_vars, 3, dummy_db, EIdentifierTypes.REFSEQ)))

        peps_from_prot = set(
            map(
                str,
                Generator.generate_peptides_from_proteins(
                    Generator.generate_proteins_from_transcripts(
                        Generator.generate_transcripts_from_variants(
                            dummy_vars, dummy_db, EIdentifierTypes.REFSEQ)),
                    3)))

        self.assertTrue(len(peps - peps_from_prot) == 0)
        self.assertTrue(len(peps_from_prot - peps) == 0)
        self.assertTrue(len(peps - exp_peps) == 0)
        self.assertTrue(len(exp_peps - peps) == 0)
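The "resulting peptides of length 3" in the docstring are simply the 3-mers of the listed protein sequences. A slicing sketch (plain Python, not Fred2's generate_peptides_from_proteins):

def kmers(protein, k=3):
    return {protein[i:i + k] for i in range(len(protein) - k + 1)}

assert kmers("KNFPRG") == {"KNF", "NFP", "FPR", "PRG"}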
Example 21
 def test__incorp_snp(self):
     ts = list("TESTSEQUENCE")
     self.assertEqual(Generator._incorp_snp(ts, var_2, "tsc_1", 6, 6), 6)
Example 22
def __main__():
    parser = argparse.ArgumentParser(
        description=
        """Individualized Proteins 2.0 \n Script for generation of protein sequences which contain provided variants.""",
        version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument(
        '-i',
        "--identifier",
        help="<Required> Predictions will be written with this name prefix",
        required=True)
    parser.add_argument(
        '-r',
        "--reference",
        help=
        "Reference, retrieved information will be based on this ensembl version",
        required=False,
        default='GRCh37',
        choices=['GRCh37', 'GRCh38'])
    parser.add_argument(
        '-db',
        "--database",
        help=
        "Proteome sequence reference database to be attached to individualized sequences",
        required=True)
    parser.add_argument('-o',
                        "--output_dir",
                        help="All files written will be put in this directory")

    args = parser.parse_args()

    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    logging.basicConfig(filename=os.path.join(
        args.output_dir, '{}_indproteinsDB.log'.format(args.identifier)),
                        filemode='w+',
                        level=logging.DEBUG)
    logging.info("Starting generation of protein sequences at " +
                 str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    if args.output_dir is not None:
        try:
            os.chdir(args.output_dir)
            logging.info("Using provided data directory: {}".format(
                str(args.output_dir)))
        except OSError:
            logging.info("No such directory, using current.")
    else:
        logging.info("Using current data directory.")
    '''start the actual IRMA functions'''
    metadata = []
    #references = {'GRCh37': 'http://grch37.ensembl.org', 'GRCh38': 'http://ensembl.org'}
    references = {
        'GRCh37': 'http://feb2014.archive.ensembl.org',
        'GRCh38': 'http://dec2016.archive.ensembl.org'
    }
    global transcriptProteinMap
    '''read in variants'''
    if args.somatic_mutations.endswith(
            '.GSvar') or args.somatic_mutations.endswith('.tsv'):
        vl, transcripts, metadata = read_GSvar(args.somatic_mutations)
    elif args.somatic_mutations.endswith('.vcf'):
        vl, transcripts = read_vcf(args.somatic_mutations)

    if args.germline_mutations is not None:
        if args.germline_mutations.endswith(
                '.GSvar') or args.germline_mutations.endswith('.tsv'):
            vl_normal, transcripts_germline, metadata = read_GSvar(
                args.germline_mutations)
        elif args.germline_mutations.endswith('.vcf'):
            vl_normal, transcripts_germline = read_vcf(args.germline_mutations)

        # combine germline and somatic variants
        vl = vl + vl_normal
        transcripts = transcripts_germline + transcripts
    transcripts = list(set(transcripts))

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # generate transcripts containing variants, filter out unmutated sequences
    transcripts = [
        g for g in generator.generate_transcripts_from_variants(
            vl, ma, ID_SYSTEM_USED) if g.vars
    ]
    #generate proteins from transcripts, table='Standard', stop_symbol='*', to_stop=True, cds=False
    proteins = generator.generate_proteins_from_transcripts(transcripts)
    diff_sequences = {}

    out_ref = args.database.split('/')[-1].replace(
        '.fasta',
        '_{}_individualized_protein_DB.fasta'.format(args.identifier))

    cpRef = 'cp {f} {o}'.format(f=args.database, o=out_ref)
    subprocess.call(cpRef.split())

    with open(out_ref, 'a') as outfile:
        for p in proteins:

            variants = []
            for v in p.vars:
                variants = variants + p.vars[v]

            c = [x.coding.values() for x in variants]
            cf = list(itertools.chain.from_iterable(c))

            cds = ','.join([y.cdsMutationSyntax for y in set(cf)])
            aas = ','.join([y.aaMutationSyntax for y in set(cf)])

            outfile.write('>{}:{}\n'.format(p.transcript_id, aas))
            outfile.write('{}\n'.format(str(p)))

    logging.info("Finished generation of protein sequences at " +
                 str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
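The reference database is copied by shelling out to cp via subprocess. A portable alternative with the same effect (a suggestion only; the function and its parameter names are placeholders):

import shutil

def copy_reference(database_path, out_path):
    # replaces: subprocess.call(["cp", database_path, out_path])
    shutil.copyfile(database_path, out_path)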
Example 23
 def test__check_for_problematic_variants(self):
     self.assertTrue(Generator._check_for_problematic_variants([var_2, var_1]))
     self.assertFalse(Generator._check_for_problematic_variants([var_5, var_6]))
Example 24
def make_predictions_from_variants(variants_all, methods, alleles, minlength,
                                   maxlength, martsadapter, protein_db,
                                   identifier, metadata, transcriptProteinMap):
    # list for all peptides and filtered peptides
    all_peptides = []
    all_peptides_filtered = []

    # dictionaries for syfpeithi matrices max values and allele mapping
    max_values_matrices = {}
    allele_string_map = {}

    # list to hold dataframes for all predictions
    pred_dataframes = []

    prots = [
        p for p in generator.generate_proteins_from_transcripts(
            generator.generate_transcripts_from_variants(
                variants_all, martsadapter, ID_SYSTEM_USED))
    ]

    for peplen in range(minlength, maxlength):
        peptide_gen = generator.generate_peptides_from_proteins(prots, peplen)

        peptides_var = [x for x in peptide_gen]

        # remove peptides which are not 'variant relevant'
        peptides = [
            x for x in peptides_var if any(
                x.get_variants_by_protein(y) for y in x.proteins.keys())
        ]

        # filter out self peptides
        selfies = [str(p) for p in peptides if protein_db.exists(str(p))]
        filtered_peptides = [p for p in peptides if str(p) not in selfies]

        all_peptides = all_peptides + peptides
        all_peptides_filtered = all_peptides_filtered + filtered_peptides

        results = []

        if len(filtered_peptides) > 0:
            for m in methods:
                try:
                    results.extend([
                        EpitopePredictorFactory(
                            m.split('-')[0],
                            version=m.split('-')[1]).predict(filtered_peptides,
                                                             alleles=alleles)
                    ])
                except Exception:
                    logging.warning(
                        "Prediction for length {length} and allele {allele} not possible with {method}."
                        .format(length=peplen,
                                allele=','.join([str(a) for a in alleles]),
                                method=m))

        if (len(results) == 0):
            continue

        df = results[0].merge_results(results[1:])

        for a in alleles:
            conv_allele = "%s_%s%s" % (a.locus, a.supertype, a.subtype)
            allele_string_map['%s_%s' %
                              (a, peplen)] = '%s_%i' % (conv_allele, peplen)
            max_values_matrices['%s_%i' %
                                (conv_allele, peplen)] = get_matrix_max_score(
                                    conv_allele, peplen)

        df.insert(0, 'length', df.index.map(create_length_column_value))
        df['chr'] = df.index.map(create_variant_chr_column_value)
        df['pos'] = df.index.map(create_variant_pos_column_value)
        df['gene'] = df.index.map(create_gene_column_value)
        df['transcripts'] = df.index.map(create_transcript_column_value)
        df['proteins'] = df.index.map(create_protein_column_value)
        df['variant type'] = df.index.map(create_variant_type_column_value)
        df['synonymous'] = df.index.map(create_variant_syn_column_value)
        df['homozygous'] = df.index.map(create_variant_hom_column_value)
        df['variant details (genomic)'] = df.index.map(
            create_mutationsyntax_genome_column_value)
        df['variant details (protein)'] = df.index.map(
            create_mutationsyntax_column_value)

        # reset index to have index as columns
        df.reset_index(inplace=True)

        for c in df.columns:
            if '*' in str(c):
                idx = df.columns.get_loc(c)
                df.insert(
                    idx + 1, '%s affinity' % c,
                    df.apply(lambda x: create_affinity_values(
                        str(c), int(x['length']), float(x[c]), x['Method'],
                        max_values_matrices, allele_string_map),
                             axis=1))
                df.insert(
                    idx + 2, '%s binder' % c,
                    df.apply(lambda x: create_binder_values(
                        float(x['%s affinity' % c]), x['Method']),
                             axis=1))
                df = df.rename(columns={c: '%s score' % c})
                df['%s score' % c] = df['%s score' %
                                        c].map(lambda x: round(x, 4))

        for c in metadata:
            df[c] = df.apply(lambda row: create_metadata_column_value(row, c),
                             axis=1)

        df = df.rename(columns={'Seq': 'sequence'})
        df = df.rename(columns={'Method': 'method'})
        pred_dataframes.append(df)

    statistics = {
        'prediction_methods': methods,
        'number_of_variants': len(variants_all),
        'number_of_peptides': len(all_peptides),
        'number_of_peptides_after_filtering': len(all_peptides_filtered)
    }

    return pred_dataframes, statistics, all_peptides_filtered
Example 25
    def test_peptides_from_variants(self):
        """
        Create multiple peptides, given a set
        containing heterozygous variants.

        Variants:
        3-DEL(-2)  , 5-INS(+3)  , 7-DEL(-4)
        HET-DEL(-2), HOM-INS(+3), HET-DEL(-1)

        Reference sequence:
        AAAAACCCCCGGGGG
        AAATTTGGGGG (DEL,INS,DEL)
        AAATTTCCCCCGGGGG (DEL,INS)
        AAAAATTTGGGGG (INS,DEL)
        AAAAATTTCCCCCGGGGG (INS)

        GGGGGCCCCCAAAAA
        GGGTTTCAAAAA (DEL,INS,DEL)
        GGGTTTCCCCCAAAAA (DEL,INS)
        GGGGGTTTCAAAAA (INS,DEL)
        GGGGGTTTCCCCCAAAAA (INS)


        Resulting protein sequences:
        KFG
        KNLG
        KFPPG
        KNFPRG

        GFK
        GGLK
        GFPPK
        GGFPQK

        Resulting peptides of length 3:
        KFG +
        KNL +
        NLG +
        KFP +
        FPP +
        PPG +
        KNF +
        NFP +
        FPR +
        PRG +

        GFK +
        GGL +
        GLK +
        GFP +
        FPP +
        PPK +
        GGF +
        GFP +
        FPQ +
        PQK +
        """
        dummy_db = DummyAdapter()

        exp_peps = set(['PRG', 'GLK', 'PPG', 'KFP', 'GFK', 'PPK', 'GFP', 'PQK', 'KNL', 'KFG', 'GGF', 'FPQ',
                        'FPP', 'NLG', 'FPR', 'KNF', 'GGL', 'NFP'])
        # 1) INS, SNP, DEL
        dummy_vars = [var_10, var_11, var_12]
        peps = set(map(lambda x: str(x), Generator.generate_peptides_from_variants(dummy_vars, 3, dummy_db)))

        self.assertTrue(len(peps-exp_peps) == 0)
        self.assertTrue(len(exp_peps-peps) == 0)
Example 26
def __main__():
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-V', '--variations', dest="var_file", help='<Required> full path to the input variations', required=True)
    parser.add_argument('-o', "--outfile", dest="outfile_path", help="Created fasta file", required=True)
    parser.add_argument('-d', "--digest", dest="digest", type=int, help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a', "--alleles", dest="alleles", help="Input alleles for prediction")
    parser.add_argument('-p', "--predict", dest="predict_with", help="Method of prediction, needs alleles & length, allowed:[{m}]".format(m=PRED_METH))
    parser.add_argument('-f', "--filter", dest="filter", type=float, help="Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict")
    parser.add_argument('-P', "--Proteins", dest="only_proteins", action='store_true', help="Will write only proteins.")
    parser.add_argument('-b', "--base", dest="basefasta_path", help="If given, entries are replaced by the variation.")

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if options.filter and not options.predict_with:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    if options.predict_with and not options.alleles:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    temp_dir = "/tmp/"

    logging.basicConfig(filename=os.path.splitext(options.outfile_path)[0] + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
                        filemode='w+', level=logging.DEBUG)  #, format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path + " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    #... look at theos filter, ligandoqc, fasta-distributions, lica and the morgenstellen server content scripts
    # complete proteins?
    # only containing binders?
    # k-mers?
    # binders only?
    # FastaSlicer.py?
    # remove original if homozygous (needs fasta input)?
    # add germline variant option? or expect all to be in one vcf?

# MyObject = type('MyObject', (object,), {})
# options = MyObject()
# setattr(options,"var_file","/home/walzer/immuno-tools/Fred2/Fred2/Data/examples/vcftestfile3.vcf")
#
# vt = os.path.splitext(options.var_file)[-1]
# if ".vcf" == vt:
#     vcfvars, accessions = FileReader.read_vcf(options.var_file)
#
# mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")
#
# transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
# transcripts = [x for x in transcript_gen if x.vars]
# transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
# protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
# proteins = [x for x in protein_gen if x.vars]
# for p in proteins:
#     p.gene_id = p.vars.values()[0][0].gene
#
#
# for t in transcripts:
#     t.gene_id = t.vars.values()[0].gene
#

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    elif ".GSvar" == vt:
        pass
        # vcfvars = FileReader.read_GSvar(options.var_file)
    else:
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print m
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")  # TODO guess id_type for mart_db from accessions

    transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)

    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        p.gene_id = p.vars.values()[0][0].gene  # assume gene name from first variant

    proteins = [p for p in proteins if not is_stop_gain(p)]  # kick out stop gains

    # First exit option
    if not (options.predict_with or options.filter) and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print "N/A"
            sys.exit(0)
        else:
            e = proteins_to_fasta(proteins)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # From now on, digestion must be set somehow
    if not options.digest:
        digest = 9
    else:
        digest = options.digest
    peptide_gen = g.generate_peptides_from_proteins(proteins, digest)
    peptides = [x for x in peptide_gen]
    peptides_var = [x for x in peptides if any(x.get_variants_by_protein(y) for y in x.proteins.keys())]  # removing unvaried

    # Second exit option
    if not (options.predict_with or options.filter):
        e = peptides_to_fasta(peptides_var)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)

    # From now on, predictions are needed
    try:
        target_alleles_set = set(FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(f=options.alleles)
        logging.error(m)
        print m, "what:", str(e)
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception as e:
        m = "Could not initialize prediction method {f}, aborting.".format(f=options.predict_with)
        logging.error(m)
        print m
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        print "something went wrong with the prediction", options.inf, options.predict_with, "what:", str(e)
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    #PRED_METH = set()
    for i, row in preds.iterrows():
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))
            #PRED_METH.add(j)  # need that later

    # Third exit option
    if not options.filter:
        if options.only_proteins:
            if options.basefasta_path:
                # TODO - replace from base fasta plus prediction annotation
                print "N/A"
                sys.exit(0)
            else:
                prs = annotate_protein_from_peptides(preds)
                e = proteins_to_fasta(prs)
                with open(options.outfile_path, 'w') as f:
                    f.write(e)
                sys.exit(0)
        else:
            e = peptides_to_fasta(preds)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # kick out nonbinder
    preds_f = preds[(preds > options.filter).any(axis=1)]

    # Fourth exit option
    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta binders only plus prediction annotation
            print "N/A"
            sys.exit(0)
        else:
            prs = annotate_protein_from_peptides(preds_f)
            e = proteins_to_fasta(prs)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)
    else:
        e = peptides_to_fasta(preds_f)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)
Example 27
 def test__incorp_insertion(self):
     ts = list("TESTSEQUENCE")
     self.assertEqual(Generator._incorp_insertion(ts, var_3, "tsc_1", 0, 0), 2)