def test_simple_incorporation(self):
    """
    Incorporate exactly one variant into one transcript.

    Reference transcript: AAAAACCCCCGGGGG

    var_3 (insertion of TT) -> AAAAACCTTCCCGGGGG
    var_1 (SNP)             -> ATAAACCCCCGGGGG
    var_4 (deletion CCCCC)  -> AAAAAGGGGG
    """
    adapter = DummyAdapter()
    # (variant, expected transcript), in the order insertion, SNP, deletion
    cases = [
        (var_3, "AAAAACCTTCCCGGGGG"),
        (var_1, "ATAAACCCCCGGGGG"),
        (var_4, "AAAAAGGGGG"),
    ]
    for variant, expected in cases:
        transcripts = Generator.generate_transcripts_from_variants(
            [variant], adapter, EIdentifierTypes.REFSEQ)
        # only the first generated transcript is inspected
        first = next(transcripts)
        self.assertEqual(str(first), expected)
def test_peptides_from_varaints(self):
    """
    Smoke-run the generator pipeline on a SNP and an insertion annotated
    on NM_080751.

    Nothing is asserted; the sizes of the intermediate results are printed.
    A MartsAdapter pointed at http://ensembl.org is used as sequence source.
    """
    snp_coding = {
        'NM_080751': MutationSyntax('NM_080751', 2629, 876, 'c.2630C>T',
                                    'p.Pro877Leu'),
    }
    snp = Variant('line0', 0, 20, 2621905, 'C', 'T', snp_coding, True, False)
    snp.gene = 'TMC2'
    ma = MartsAdapter(biomart="http://ensembl.org")
    insertion_coding = {
        'NM_080751': MutationSyntax('NM_080751', 2625, 876, 'c.2630C>T',
                                    'p.Pro877Leu'),
    }
    insertion = Variant("testInsertion", 2, 20, 2621899, "", "AAAAAA",
                        insertion_coding, True, False)
    variants = [snp, insertion]

    # direct route: variants -> peptides
    direct_peptides = list(Generator.generate_peptides_from_variants(
        variants, 9, ma, id_type=EIdentifierTypes.REFSEQ, peptides=None))
    print(len(direct_peptides))

    # using a tweaked generator that takes another sequence source if the
    # sequence is too short in respect to the given variants
    # in this case a newer/older sequence from mart in respect to what was
    # given as reference in the annotation process
    transcripts = list(Generator.generate_transcripts_from_variants(
        variants, ma, id_type=EIdentifierTypes.REFSEQ))
    print(len(transcripts[0]))

    # stepwise route: transcripts -> proteins -> peptides
    proteins = list(Generator.generate_proteins_from_transcripts(
        transcripts, to_stop=True))
    stepwise_peptides = list(
        Generator.generate_peptides_from_proteins(proteins, 9))
    print(len(stepwise_peptides))
    print(len(variants))
def test_simple_incorporation(self):
    """
    Incorporate exactly one variant into one transcript (default id type).

    Reference transcript: AAAAACCCCCGGGGG

    var_3 (insertion of TT) -> AAAAACCTTCCCGGGGG
    var_1 (SNP)             -> ATAAACCCCCGGGGG
    var_4 (deletion CCCCC)  -> AAAAAGGGGG
    """
    adapter = DummyAdapter()
    # (variant, expected transcript), in the order insertion, SNP, deletion
    cases = [
        (var_3, "AAAAACCTTCCCGGGGG"),
        (var_1, "ATAAACCCCCGGGGG"),
        (var_4, "AAAAAGGGGG"),
    ]
    for variant, expected in cases:
        transcripts = Generator.generate_transcripts_from_variants(
            [variant], adapter)
        # only the first generated transcript is inspected
        first = next(transcripts)
        self.assertEqual(str(first), expected)
def test_real_life_test(self):
    """
    Check on the bundled ANNOVAR example that generating 9-mers directly
    from variants yields the same peptide set as the stepwise route
    transcripts -> proteins -> peptides.
    """
    mart = MartsAdapter(
        biomart="http://grch37.ensembl.org/biomart/martservice?query=")
    package_dir = os.path.dirname(inspect.getfile(Fred2))
    ano_path = os.path.join(package_dir, "Data/examples/test_annovar.out")
    variants = read_annovar_exonic(ano_path)

    # direct route
    peps = set(
        str(pep) for pep in Generator.generate_peptides_from_variants(
            variants, 9, mart, EIdentifierTypes.REFSEQ))

    # stepwise route
    transcripts = Generator.generate_transcripts_from_variants(
        variants, mart, EIdentifierTypes.REFSEQ)
    proteins = Generator.generate_proteins_from_transcripts(transcripts)
    peps_from_prot = set(
        str(pep)
        for pep in Generator.generate_peptides_from_proteins(proteins, 9))

    # both set differences must be empty, i.e. the sets are equal
    only_direct = peps - peps_from_prot
    only_stepwise = peps_from_prot - peps
    self.assertTrue(len(only_direct) == 0)
    self.assertTrue(len(only_stepwise) == 0)
def test_proteins_from_variants(self):
    """
    Translate all transcript variants built from var_10, var_11 and var_12
    and compare the resulting protein set with the expectation.

    Expected protein sequences:
        KFG  KNLG  KFPPG  KNFPRG
        GFK  GGLK  GFPPK  GGFPQK
    """
    adapter = DummyAdapter()
    variants = [var_10, var_11, var_12]
    exp_prot = {
        'KFG', 'KNLG', 'KFPPG', 'KNFPRG',
        'GFK', 'GGLK', 'GFPPK', 'GGFPQK',
    }

    transcripts = Generator.generate_transcripts_from_variants(
        variants, adapter, EIdentifierTypes.REFSEQ)
    prot = set(
        str(p)
        for p in Generator.generate_proteins_from_transcripts(transcripts))

    # neither unexpected nor missing proteins are allowed
    unexpected = prot - exp_prot
    missing = exp_prot - prot
    self.assertTrue(len(unexpected) == 0)
    self.assertTrue(len(missing) == 0)
def test_offset_single(self):
    """
    Several variants on one transcript: the offset caused by earlier
    variants has to be respected by later ones (default id type).

    Reference transcript: AAAAACCCCCGGGGG
    Still one transcript with a single transcript variant; each variant
    lies clearly downstream of its predecessor.
    """
    adapter = DummyAdapter()
    cases = [
        # 1) INS, SNP, DEL
        ([var_3, var_7, var_6], "AAAAACCTTCTGGGG"),
        # 2) INS, DEL, INS
        ([var_9, var_4, var_8], "AATTAAAGGGGGTTT"),
    ]
    for variants, expected in cases:
        transcripts = Generator.generate_transcripts_from_variants(
            variants, adapter)
        first = next(transcripts)
        self.assertEqual(str(first), expected)
def test_offset_single(self):
    """
    Several variants on one transcript: the offset caused by earlier
    variants has to be respected by later ones.

    Reference transcript: AAAAACCCCCGGGGG
    Still one transcript with a single transcript variant; each variant
    lies clearly downstream of its predecessor.
    """
    adapter = DummyAdapter()
    cases = [
        # 1) INS, SNP, DEL
        ([var_3, var_7, var_6], "AAAAACCTTCTGGGG"),
        # 2) INS, DEL, INS
        ([var_9, var_4, var_8], "AATTAAAGGGGGTTT"),
    ]
    for variants, expected in cases:
        transcripts = Generator.generate_transcripts_from_variants(
            variants, adapter, EIdentifierTypes.REFSEQ)
        first = next(transcripts)
        self.assertEqual(str(first), expected)
def test_proteins_from_variants(self):
    """
    Translate all transcript variants built from var_10, var_11 and var_12
    and compare the resulting protein set with the expectation.

    Expected protein sequences:
        KFG  KNLG  KFPPG  KNFPRG
        GFK  GGLK  GFPPK  GGFPQK
    """
    adapter = DummyAdapter()
    variants = [var_10, var_11, var_12]
    exp_prot = {
        'KFG', 'KNLG', 'KFPPG', 'KNFPRG',
        'GFK', 'GGLK', 'GFPPK', 'GGFPQK',
    }

    transcripts = Generator.generate_transcripts_from_variants(
        variants, adapter, EIdentifierTypes.REFSEQ)
    prot = set(
        str(p)
        for p in Generator.generate_proteins_from_transcripts(transcripts))

    # neither unexpected nor missing proteins are allowed
    unexpected = prot - exp_prot
    missing = exp_prot - prot
    self.assertTrue(len(unexpected) == 0)
    self.assertTrue(len(missing) == 0)
def test_non_syn_hetero_snp_trans_number(self):
    """
    The number of generated transcripts must be 2 to the power of the
    number of heterozygous (non-homozygous) variants.

    1 hetero var = 2 transcripts
    :return:
    """
    vars_ = [
        self.non_syn_hetero_snp,
        self.non_frame_shift_del,
        self.syn_homo_snp,
    ]
    generated = list(Generator.generate_transcripts_from_variants(
        vars_, self.db_adapter, EIdentifierTypes.REFSEQ))
    # every variant that is not homozygous doubles the transcript count
    n_hetero = sum(1 for v in vars_ if not v.isHomozygous)
    self.assertTrue(len(generated) == 2 ** n_hetero)
def test_non_syn_hetero_snp_trans_number(self):
    """
    The number of generated transcripts must be 2 to the power of the
    number of heterozygous (non-homozygous) variants (default id type).

    1 hetero var = 2 transcripts
    :return:
    """
    vars_ = [
        self.non_syn_hetero_snp,
        self.non_frame_shift_del,
        self.syn_homo_snp,
    ]
    generated = list(Generator.generate_transcripts_from_variants(
        vars_, self.db_adapter))
    # every variant that is not homozygous doubles the transcript count
    n_hetero = sum(1 for v in vars_ if not v.isHomozygous)
    self.assertTrue(len(generated) == 2 ** n_hetero)
def test_real_life_test(self):
    """
    Check on the bundled ANNOVAR example that generating 9-mers directly
    from variants yields the same peptide set as the stepwise route
    transcripts -> proteins -> peptides.
    """
    mart = MartsAdapter(
        biomart="http://grch37.ensembl.org/biomart/martservice?query=")
    package_dir = os.path.dirname(inspect.getfile(Fred2))
    ano_path = os.path.join(package_dir, "Data/examples/test_annovar.out")
    variants = read_annovar_exonic(ano_path)

    # direct route
    peps = set(
        str(pep) for pep in Generator.generate_peptides_from_variants(
            variants, 9, mart, EIdentifierTypes.REFSEQ))

    # stepwise route
    transcripts = Generator.generate_transcripts_from_variants(
        variants, mart, EIdentifierTypes.REFSEQ)
    proteins = Generator.generate_proteins_from_transcripts(transcripts)
    peps_from_prot = set(
        str(pep)
        for pep in Generator.generate_peptides_from_proteins(proteins, 9))

    # both set differences must be empty, i.e. the sets are equal
    only_direct = peps - peps_from_prot
    only_stepwise = peps_from_prot - peps
    self.assertTrue(len(only_direct) == 0)
    self.assertTrue(len(only_stepwise) == 0)
def test_heterozygous_variants(self):
    """
    Create multiple transcript variants for a transcript, given a set
    containing heterozygous variants (var_10, var_11, var_12).

    Eight transcript variants are expected in total:

        AAATTTGGGGG    AAAAATTTGGGGG
        AAATTTCCCCCGGGGG    AAAAATTTCCCCCGGGGG
        GGGTTTAAAAA    GGGGGTTTAAAAA
        GGGTTTCCCCCAAAAA    GGGGGTTTCCCCCAAAAA

    NOTE(review): presumably two reference transcripts (AAAAACCCCCGGGGG and
    GGGGGCCCCCAAAAA) with four combinations each - confirm against the
    fixtures.
    """
    adapter = DummyAdapter()
    variants = [var_10, var_11, var_12]
    trans_gener = Generator.generate_transcripts_from_variants(
        variants, adapter, EIdentifierTypes.REFSEQ)
    trans = [str(t) for t in trans_gener]

    self.assertEqual(len(trans), 8)
    expected = (
        "AAATTTGGGGG",
        "AAAAATTTGGGGG",
        "AAATTTCCCCCGGGGG",
        "AAAAATTTCCCCCGGGGG",
        "GGGTTTAAAAA",
        "GGGGGTTTAAAAA",
        "GGGTTTCCCCCAAAAA",
        "GGGGGTTTCCCCCAAAAA",
    )
    for sequence in expected:
        self.assertTrue(sequence in trans)
def test_heterozygous_variants(self):
    """
    Create multiple transcript variants for a transcript, given a set
    containing heterozygous variants (var_10, var_11, var_12); the
    generator is called with its default id type.

    Eight transcript variants are expected in total:

        AAATTTGGGGG    AAAAATTTGGGGG
        AAATTTCCCCCGGGGG    AAAAATTTCCCCCGGGGG
        GGGTTTAAAAA    GGGGGTTTAAAAA
        GGGTTTCCCCCAAAAA    GGGGGTTTCCCCCAAAAA

    NOTE(review): presumably two reference transcripts (AAAAACCCCCGGGGG and
    GGGGGCCCCCAAAAA) with four combinations each - confirm against the
    fixtures.
    """
    adapter = DummyAdapter()
    variants = [var_10, var_11, var_12]
    trans_gener = Generator.generate_transcripts_from_variants(
        variants, adapter)
    trans = [str(t) for t in trans_gener]

    self.assertEqual(len(trans), 8)
    expected = (
        "AAATTTGGGGG",
        "AAAAATTTGGGGG",
        "AAATTTCCCCCGGGGG",
        "AAAAATTTCCCCCGGGGG",
        "GGGTTTAAAAA",
        "GGGGGTTTAAAAA",
        "GGGTTTCCCCCAAAAA",
        "GGGGGTTTCCCCCAAAAA",
    )
    for sequence in expected:
        self.assertTrue(sequence in trans)
def __main__():
    """
    Command-line entry point: create a FASTA file from a variant file.

    Work flow:
      1. read the variants (only ``.vcf`` is supported),
      2. generate proteins from the variant transcripts, keeping only
         proteins that carry variants and are no stop gains,
      3. depending on the options write proteins or peptides, optionally
         annotated with / filtered by epitope predictions.

    The function never returns normally: it calls ``sys.exit(0)`` after the
    output was produced (or after printing "N/A" for the not yet implemented
    base-fasta replacement) and ``sys.exit(1)`` on any error.
    """
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-V', '--variations', dest="var_file",
                        help='<Required> full path to the input variations',
                        required=True)
    parser.add_argument('-o', "--outfile", dest="outfile_path",
                        help="Created fasta file", required=True)
    parser.add_argument('-d', "--digest", dest="digest", type=int,
                        help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a', "--alleles", dest="alleles",
                        help="Input alleles for prediction")
    parser.add_argument('-p', "--predict", dest="predict_with",
                        help="Method of prediction, needs alleles & length, allowed:[{m}]".format(m=PRED_METH))
    parser.add_argument('-f', "--filter", dest="filter", type=float,
                        help="Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict")
    parser.add_argument('-P', "--Proteins", dest="only_proteins",
                        action='store_true',
                        help="Will write only proteins.")
    parser.add_argument('-b', "--base", dest="basefasta_path",
                        help="If given, entries are replaced by the variation.")

    # Must happen before parse_args(): with required arguments parse_args()
    # itself exits on an empty command line, so the check was unreachable.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)
    options = parser.parse_args()

    if options.filter and not options.predict_with:
        parser.print_help()
        print("Need predict with filter option, aborting!")
        sys.exit(1)

    if options.predict_with and not options.alleles:
        parser.print_help()
        print("Need alleles with predict option, aborting!")
        sys.exit(1)

    def _write_and_exit(fasta_text):
        # Write the finished FASTA text to the output file and stop.
        with open(options.outfile_path, 'w') as f:
            f.write(fasta_text)
        sys.exit(0)

    # the log file is placed next to the output file, tagged with a timestamp
    logging.basicConfig(
        filename=os.path.splitext(options.outfile_path)[0]
        + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
        filemode='w+',
        level=logging.DEBUG)  # , format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path
                 + " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    # Open questions / ideas:
    # ... look at theos filter, ligandoqc, fasta-distributions, lica and the
    #     morgenstellen server conten scripts
    # complete proteins?
    # only containing binders?
    # k-mers?
    # binders only?
    # FastaSlicer.py?
    # remove original if homozygous (needs fasta input)?
    # add germline variant option? or expect all to be in one vcf?

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    elif ".GSvar" == vt:
        # TODO: vcfvars = FileReader.read_GSvar(options.var_file)
        # Until then abort explicitly instead of failing later with a
        # NameError on the never assigned variant list.
        m = "GSvar input is not supported yet ({f}), aborting.".format(
            f=options.var_file)
        logging.error(m)
        print(m)
        sys.exit(1)
    else:
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print(m)
        sys.exit(1)

    # TODO guess id_type for mart_db from accessions
    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")

    transcript_gen = g.generate_transcripts_from_variants(
        vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        # assume gene name from first variant
        p.gene_id = p.vars.values()[0][0].gene

    # kick out stop gains
    proteins = [p for p in proteins if not is_stop_gain(p)]

    needs_prediction = options.predict_with or options.filter

    # First exit option: plain proteins
    if not needs_prediction and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print("N/A")
            sys.exit(0)
        _write_and_exit(proteins_to_fasta(proteins))

    # From now on, digestion must be set somehow
    digest = options.digest if options.digest else 9
    peptides = list(g.generate_peptides_from_proteins(proteins, digest))
    # removing unvaried
    peptides_var = [
        x for x in peptides
        if any(x.get_variants_by_protein(y) for y in x.proteins.keys())
    ]

    # Second exit option: plain peptides
    if not needs_prediction:
        _write_and_exit(peptides_to_fasta(peptides_var))

    # From now on, predictions are needed
    try:
        target_alleles_set = set(
            FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(
            f=options.alleles)
        logging.error(m)
        print("%s what: %s" % (m, e))
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception:
        m = "Could not initialize prediction method {f}, aborting.".format(
            f=options.predict_with)
        logging.error(m)
        print(m)
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        # report the input file; the formerly used ``options.inf`` does not
        # exist and turned every prediction error into an AttributeError
        print("something went wrong with the prediction %s %s what: %s"
              % (options.var_file, options.predict_with, e))
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    for i, row in preds.iterrows():
        # i is the row's index tuple: i[0] is the peptide, the remaining
        # entries are used as metadata keys
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))

    # Third exit option writes all predictions, fourth only the rows with at
    # least one value above the threshold (kick out nonbinder).
    if options.filter:
        preds = preds[(preds > options.filter).any(axis=1)]

    if not options.only_proteins:
        _write_and_exit(peptides_to_fasta(preds))

    if options.basefasta_path:
        # TODO - replace from base fasta (binders only) plus prediction
        # annotation
        print("N/A")
        sys.exit(0)

    prs = annotate_protein_from_peptides(preds)
    _write_and_exit(proteins_to_fasta(prs))
def test__check_for_problematic_variants(self):
    """
    ``_check_for_problematic_variants`` must be truthy for [var_2, var_1]
    and falsy for [var_5, var_6].
    """
    first_result = Generator._check_for_problematic_variants([var_2, var_1])
    self.assertTrue(first_result)

    second_result = Generator._check_for_problematic_variants([var_5, var_6])
    self.assertFalse(second_result)
def test__incorp_deletion(self):
    """
    ``_incorp_deletion`` must return -5 for var_4 and -2 for var_6.
    """
    sequence = list("TESTSEQUEASDFGNCES")
    # the same list object is handed to both calls, one after the other
    for variant, expected in ((var_4, -5), (var_6, -2)):
        result = Generator._incorp_deletion(sequence, variant, "tsc_1", 0, 0)
        self.assertEqual(result, expected)
def test__incorp_insertion(self):
    """
    ``_incorp_insertion`` must return 2 for var_3.
    """
    sequence = list("TESTSEQUENCE")
    result = Generator._incorp_insertion(sequence, var_3, "tsc_1", 0, 0)
    self.assertEqual(result, 2)
def test__incorp_snp(self):
    """
    ``_incorp_snp`` must return 6 for var_2 when called with 6 for both
    trailing arguments.
    """
    sequence = list("TESTSEQUENCE")
    result = Generator._incorp_snp(sequence, var_2, "tsc_1", 6, 6)
    self.assertEqual(result, 6)
def test__incorp_deletion(self):
    """
    ``_incorp_deletion`` must return -5 for var_4 and -2 for var_6.
    """
    sequence = list("TESTSEQUEASDFGNCES")
    # the same list object is handed to both calls, one after the other
    for variant, expected in ((var_4, -5), (var_6, -2)):
        result = Generator._incorp_deletion(sequence, variant, "tsc_1", 0, 0)
        self.assertEqual(result, expected)
def test_peptides_from_variants(self):
    """
    Create peptides of length 3 from var_10, var_11 and var_12 and check
    them twice: against the stepwise route
    (transcripts -> proteins -> peptides) and against the expected set.

    Protein sequences the expectation is derived from:
        KFG  KNLG  KFPPG  KNFPRG
        GFK  GGLK  GFPPK  GGFPQK

    Resulting peptides of length 3:
        KFG
        KNL NLG
        KFP FPP PPG
        KNF NFP FPR PRG
        GFK
        GGL GLK
        GFP FPP PPK
        GGF GFP FPQ PQK
    """
    adapter = DummyAdapter()
    exp_peps = {
        'PRG', 'GLK', 'PPG', 'KFP', 'GFK', 'PPK', 'GFP', 'PQK', 'KNL',
        'KFG', 'GGF', 'FPQ', 'FPP', 'NLG', 'FPR', 'KNF', 'GGL', 'NFP',
    }
    variants = [var_10, var_11, var_12]

    # direct route
    peps = set(
        str(pep) for pep in Generator.generate_peptides_from_variants(
            variants, 3, adapter, EIdentifierTypes.REFSEQ))

    # stepwise route
    transcripts = Generator.generate_transcripts_from_variants(
        variants, adapter, EIdentifierTypes.REFSEQ)
    proteins = Generator.generate_proteins_from_transcripts(transcripts)
    peps_from_prot = set(
        str(pep)
        for pep in Generator.generate_peptides_from_proteins(proteins, 3))

    # all four set differences have to be empty
    for difference in (peps - peps_from_prot,
                       peps_from_prot - peps,
                       peps - exp_peps,
                       exp_peps - peps):
        self.assertTrue(len(difference) == 0)
def test__incorp_snp(self):
    """
    ``_incorp_snp`` must return 6 for var_2 when called with 6 for both
    trailing arguments.
    """
    ts = list("TESTSEQUENCE")
    # The stray debug ``print`` in front of the assertion was removed; it
    # only cluttered the test output on every run.
    self.assertEqual(Generator._incorp_snp(ts, var_2, "tsc_1", 6, 6), 6)
def __main__():
    """
    Command-line entry point: build an individualized protein database.

    Reads somatic and/or germline variants (``.GSvar``/``.tsv`` or
    ``.vcf``), generates the transcripts that carry variants, translates
    them and appends the resulting proteins to a copy of the given proteome
    FASTA file. The copy is named after the database file with ``.fasta``
    replaced by ``_<identifier>_individualized_protein_DB.fasta`` and is
    created in the working directory (the output directory if it could be
    entered). Exits with status 1 on an empty command line or an
    unsupported variant file type.
    """
    parser = argparse.ArgumentParser(
        description="""Individualized Proteins 2.0 \n Script for generation of protein sequences which contain provided variants.""",
        version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument(
        '-i', "--identifier",
        help="<Required> Predictions will be written with this name prefix",
        required=True)
    parser.add_argument(
        '-r', "--reference",
        help="Reference, retrieved information will be based on this ensembl version",
        required=False, default='GRCh37', choices=['GRCh37', 'GRCh38'])
    parser.add_argument(
        '-db', "--database",
        help="Proteome sequence reference database to be attached to individualized sequences",
        required=True)
    parser.add_argument('-o', "--output_dir",
                        help="All files written will be put in this directory")

    # Must happen before parse_args(): with required arguments parse_args()
    # itself exits on an empty command line, so the check was unreachable.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()

    # -o is optional: fall back to the current directory for the log file
    # instead of crashing in os.path.join(None, ...)
    log_name = '{}_indproteinsDB.log'.format(args.identifier)
    logging.basicConfig(filename=os.path.join(args.output_dir or '', log_name),
                        filemode='w+',
                        level=logging.DEBUG)
    logging.info("Starting generation of protein sequences at "
                 + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    if args.output_dir is not None:
        try:
            os.chdir(args.output_dir)
            logging.info("Using provided data directory: {}".format(
                str(args.output_dir)))
        except OSError:
            logging.info("No such directory, using current.")
    else:
        logging.info("Using current data directory.")

    # start the actual IRMA functions
    # references = {'GRCh37': 'http://grch37.ensembl.org', 'GRCh38': 'http://ensembl.org'}
    references = {
        'GRCh37': 'http://feb2014.archive.ensembl.org',
        'GRCh38': 'http://dec2016.archive.ensembl.org'
    }

    # NOTE(review): declared global as before, but never assigned in this
    # function - presumably vestigial, confirm before removing.
    global transcriptProteinMap

    def _read_variants(path):
        # Pick the reader by file extension and return the variant list.
        if path.endswith('.GSvar') or path.endswith('.tsv'):
            variants, _transcript_ids, _metadata = read_GSvar(path)
        elif path.endswith('.vcf'):
            variants, _transcript_ids = read_vcf(path)
        else:
            logging.error(
                "Unsupported variant file type: {}, aborting.".format(path))
            sys.exit(1)
        return variants

    # read in variants; both inputs are optional
    vl = []
    if args.somatic_mutations is not None:
        vl = _read_variants(args.somatic_mutations)
    if args.germline_mutations is not None:
        # combine germline and somatic variants
        vl = vl + _read_variants(args.germline_mutations)

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # generate transcripts containing variants, filter for unmutated sequences
    transcripts = [
        t for t in generator.generate_transcripts_from_variants(
            vl, ma, ID_SYSTEM_USED) if t.vars
    ]

    # generate proteins from transcripts, table='Standard', stop_symbol='*',
    # to_stop=True, cds=False
    proteins = generator.generate_proteins_from_transcripts(transcripts)

    out_ref = args.database.split('/')[-1].replace(
        '.fasta',
        '_{}_individualized_protein_DB.fasta'.format(args.identifier))
    # argument list instead of a split command string, so paths containing
    # whitespace are copied correctly
    subprocess.call(['cp', args.database, out_ref])

    with open(out_ref, 'a') as outfile:
        for p in proteins:
            # flatten the variant lists stored per key in p.vars
            variants = [v for key in p.vars for v in p.vars[key]]
            syntaxes = set(itertools.chain.from_iterable(
                x.coding.values() for x in variants))
            aas = ','.join(y.aaMutationSyntax for y in syntaxes)
            outfile.write('>{}:{}\n'.format(p.transcript_id, aas))
            outfile.write('{}\n'.format(str(p)))

    logging.info("Finished generation of protein sequences at "
                 + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
def test__check_for_problematic_variants(self):
    """
    ``_check_for_problematic_variants`` must be truthy for [var_2, var_1]
    and falsy for [var_5, var_6].
    """
    first_result = Generator._check_for_problematic_variants([var_2, var_1])
    self.assertTrue(first_result)

    second_result = Generator._check_for_problematic_variants([var_5, var_6])
    self.assertFalse(second_result)
def make_predictions_from_variants(variants_all, methods, alleles, minlength,
                                   maxlength, martsadapter, protein_db,
                                   identifier, metadata, transcriptProteinMap):
    """
    Generate variant peptides and run epitope predictions on them.

    :param variants_all: variants to incorporate into transcripts
    :param methods: prediction method names of the form ``<name>-<version>``
    :param alleles: alleles to predict for (``locus``, ``supertype`` and
                    ``subtype`` attributes are read)
    :param minlength: first peptide length to generate
    :param maxlength: end of the length range. NOTE(review): ``range``
                      excludes ``maxlength`` itself - confirm that callers
                      pass the largest wanted length plus one.
    :param martsadapter: adapter handed to the transcript generator
    :param protein_db: object whose ``exists(sequence)`` is used to drop
                       peptides that already occur in the reference
    :param identifier: unused in this function
    :param metadata: names of additional columns, filled via
                     ``create_metadata_column_value``
    :param transcriptProteinMap: unused in this function
    :return: tuple ``(pred_dataframes, statistics, all_peptides_filtered)``:
             one dataframe per peptide length with at least one successful
             prediction, a dict of counts, and all peptides that survived
             the self-peptide filter
    """
    # list for all peptides and filtered peptides
    all_peptides = []
    all_peptides_filtered = []

    # dictionaries for syfpeithi matrices max values and allele mapping
    max_values_matrices = {}
    allele_string_map = {}

    # list to hold dataframes for all predictions
    pred_dataframes = []

    prots = list(generator.generate_proteins_from_transcripts(
        generator.generate_transcripts_from_variants(
            variants_all, martsadapter, ID_SYSTEM_USED)))

    for peplen in range(minlength, maxlength):
        peptide_gen = generator.generate_peptides_from_proteins(prots, peplen)

        # remove peptides which are not 'variant relevant'
        peptides = [
            x for x in peptide_gen
            if any(x.get_variants_by_protein(y) for y in x.proteins.keys())
        ]

        # filter out self peptides; a set keeps the membership test cheap
        selfies = set(str(p) for p in peptides if protein_db.exists(str(p)))
        filtered_peptides = [p for p in peptides if str(p) not in selfies]

        all_peptides.extend(peptides)
        all_peptides_filtered.extend(filtered_peptides)

        results = []
        if len(filtered_peptides) > 0:
            for m in methods:
                try:
                    name_and_version = m.split('-')
                    predictor = EpitopePredictorFactory(
                        name_and_version[0], version=name_and_version[1])
                    results.append(
                        predictor.predict(filtered_peptides, alleles=alleles))
                except Exception:
                    # Best effort: a failing method is skipped. Unlike a bare
                    # ``except`` this lets KeyboardInterrupt/SystemExit pass.
                    logging.warning(
                        "Prediction for length {length} and allele {allele} not possible with {method}."
                        .format(length=peplen,
                                allele=','.join([str(a) for a in alleles]),
                                method=m))

        # nothing to report for this length
        if len(results) == 0:
            continue

        # merge dataframes of the performed predictions
        df = results[0].merge_results(results[1:])

        for a in alleles:
            conv_allele = "%s_%s%s" % (a.locus, a.supertype, a.subtype)
            allele_string_map['%s_%s' % (a, peplen)] = '%s_%i' % (conv_allele,
                                                                  peplen)
            max_values_matrices['%s_%i' % (conv_allele, peplen)] = \
                get_matrix_max_score(conv_allele, peplen)

        # annotation columns derived from the index entries
        df.insert(0, 'length', df.index.map(create_length_column_value))
        df['chr'] = df.index.map(create_variant_chr_column_value)
        df['pos'] = df.index.map(create_variant_pos_column_value)
        df['gene'] = df.index.map(create_gene_column_value)
        df['transcripts'] = df.index.map(create_transcript_column_value)
        df['proteins'] = df.index.map(create_protein_column_value)
        df['variant type'] = df.index.map(create_variant_type_column_value)
        df['synonymous'] = df.index.map(create_variant_syn_column_value)
        df['homozygous'] = df.index.map(create_variant_hom_column_value)
        df['variant details (genomic)'] = df.index.map(
            create_mutationsyntax_genome_column_value)
        df['variant details (protein)'] = df.index.map(
            create_mutationsyntax_column_value)

        # reset index to have index as columns
        df.reset_index(inplace=True)

        # Iterate over a snapshot of the columns: the loop inserts and
        # renames columns, and the freshly added '... affinity' and
        # '... binder' columns must not be visited themselves.
        for c in list(df.columns):
            # only columns with a '*' in their name are treated as
            # per-allele score columns
            if '*' not in str(c):
                continue
            idx = df.columns.get_loc(c)
            df.insert(
                idx + 1, '%s affinity' % c,
                df.apply(lambda x: create_affinity_values(
                    str(c), int(x['length']), float(x[c]), x['Method'],
                    max_values_matrices, allele_string_map),
                         axis=1))
            df.insert(
                idx + 2, '%s binder' % c,
                df.apply(lambda x: create_binder_values(
                    float(x['%s affinity' % c]), x['Method']),
                         axis=1))
            df = df.rename(columns={c: '%s score' % c})
            df['%s score' % c] = df['%s score' % c].map(
                lambda x: round(x, 4))

        for c in metadata:
            df[c] = df.apply(
                lambda row: create_metadata_column_value(row, c), axis=1)

        df = df.rename(columns={'Seq': 'sequence'})
        df = df.rename(columns={'Method': 'method'})
        pred_dataframes.append(df)

    statistics = {
        'prediction_methods': methods,
        'number_of_variants': len(variants_all),
        'number_of_peptides': len(all_peptides),
        'number_of_peptides_after_filtering': len(all_peptides_filtered)
    }

    return pred_dataframes, statistics, all_peptides_filtered
def test_peptides_from_variants(self):
    """
    Create peptides of length 3 from var_10, var_11 and var_12 (default id
    type) and compare them with the expected set.

    Protein sequences the expectation is derived from:
        KFG  KNLG  KFPPG  KNFPRG
        GFK  GGLK  GFPPK  GGFPQK

    Resulting peptides of length 3:
        KFG
        KNL NLG
        KFP FPP PPG
        KNF NFP FPR PRG
        GFK
        GGL GLK
        GFP FPP PPK
        GGF GFP FPQ PQK
    """
    adapter = DummyAdapter()
    exp_peps = {
        'PRG', 'GLK', 'PPG', 'KFP', 'GFK', 'PPK', 'GFP', 'PQK', 'KNL',
        'KFG', 'GGF', 'FPQ', 'FPP', 'NLG', 'FPR', 'KNF', 'GGL', 'NFP',
    }
    variants = [var_10, var_11, var_12]

    peps = set(
        str(pep) for pep in Generator.generate_peptides_from_variants(
            variants, 3, adapter))

    # neither unexpected nor missing peptides are allowed
    unexpected = peps - exp_peps
    missing = exp_peps - peps
    self.assertTrue(len(unexpected) == 0)
    self.assertTrue(len(missing) == 0)
def __main__():
    """
    Command-line entry point: create a FASTA file from a variant file.

    Work flow:
      1. read the variants (only ``.vcf`` is supported),
      2. generate proteins from the variant transcripts, keeping only
         proteins that carry variants and are no stop gains,
      3. depending on the options write proteins or peptides, optionally
         annotated with / filtered by epitope predictions.

    The function never returns normally: it calls ``sys.exit(0)`` after the
    output was produced (or after printing "N/A" for the not yet implemented
    base-fasta replacement) and ``sys.exit(1)`` on any error.
    """
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-V', '--variations', dest="var_file",
                        help='<Required> full path to the input variations',
                        required=True)
    parser.add_argument('-o', "--outfile", dest="outfile_path",
                        help="Created fasta file", required=True)
    parser.add_argument('-d', "--digest", dest="digest", type=int,
                        help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a', "--alleles", dest="alleles",
                        help="Input alleles for prediction")
    parser.add_argument('-p', "--predict", dest="predict_with",
                        help="Method of prediction, needs alleles & length, allowed:[{m}]".format(m=PRED_METH))
    parser.add_argument('-f', "--filter", dest="filter", type=float,
                        help="Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict")
    parser.add_argument('-P', "--Proteins", dest="only_proteins",
                        action='store_true',
                        help="Will write only proteins.")
    parser.add_argument('-b', "--base", dest="basefasta_path",
                        help="If given, entries are replaced by the variation.")

    # Must happen before parse_args(): with required arguments parse_args()
    # itself exits on an empty command line, so the check was unreachable.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)
    options = parser.parse_args()

    if options.filter and not options.predict_with:
        parser.print_help()
        print("Need predict with filter option, aborting!")
        sys.exit(1)

    if options.predict_with and not options.alleles:
        parser.print_help()
        print("Need alleles with predict option, aborting!")
        sys.exit(1)

    def _write_and_exit(fasta_text):
        # Write the finished FASTA text to the output file and stop.
        with open(options.outfile_path, 'w') as f:
            f.write(fasta_text)
        sys.exit(0)

    # the log file is placed next to the output file, tagged with a timestamp
    logging.basicConfig(
        filename=os.path.splitext(options.outfile_path)[0]
        + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
        filemode='w+',
        level=logging.DEBUG)  # , format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path
                 + " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    # Open questions / ideas:
    # ... look at theos filter, ligandoqc, fasta-distributions, lica and the
    #     morgenstellen server conten scripts
    # complete proteins?
    # only containing binders?
    # k-mers?
    # binders only?
    # FastaSlicer.py?
    # remove original if homozygous (needs fasta input)?
    # add germline variant option? or expect all to be in one vcf?

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    elif ".GSvar" == vt:
        # TODO: vcfvars = FileReader.read_GSvar(options.var_file)
        # Until then abort explicitly instead of failing later with a
        # NameError on the never assigned variant list.
        m = "GSvar input is not supported yet ({f}), aborting.".format(
            f=options.var_file)
        logging.error(m)
        print(m)
        sys.exit(1)
    else:
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print(m)
        sys.exit(1)

    # TODO guess id_type for mart_db from accessions
    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")

    transcript_gen = g.generate_transcripts_from_variants(
        vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        # assume gene name from first variant
        p.gene_id = p.vars.values()[0][0].gene

    # kick out stop gains
    proteins = [p for p in proteins if not is_stop_gain(p)]

    needs_prediction = options.predict_with or options.filter

    # First exit option: plain proteins
    if not needs_prediction and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print("N/A")
            sys.exit(0)
        _write_and_exit(proteins_to_fasta(proteins))

    # From now on, digestion must be set somehow
    digest = options.digest if options.digest else 9
    peptides = list(g.generate_peptides_from_proteins(proteins, digest))
    # removing unvaried
    peptides_var = [
        x for x in peptides
        if any(x.get_variants_by_protein(y) for y in x.proteins.keys())
    ]

    # Second exit option: plain peptides
    if not needs_prediction:
        _write_and_exit(peptides_to_fasta(peptides_var))

    # From now on, predictions are needed
    try:
        target_alleles_set = set(
            FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(
            f=options.alleles)
        logging.error(m)
        print("%s what: %s" % (m, e))
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception:
        m = "Could not initialize prediction method {f}, aborting.".format(
            f=options.predict_with)
        logging.error(m)
        print(m)
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        # report the input file; the formerly used ``options.inf`` does not
        # exist and turned every prediction error into an AttributeError
        print("something went wrong with the prediction %s %s what: %s"
              % (options.var_file, options.predict_with, e))
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    for i, row in preds.iterrows():
        # i is the row's index tuple: i[0] is the peptide, the remaining
        # entries are used as metadata keys
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))

    # Third exit option writes all predictions, fourth only the rows with at
    # least one value above the threshold (kick out nonbinder).
    if options.filter:
        preds = preds[(preds > options.filter).any(axis=1)]

    if not options.only_proteins:
        _write_and_exit(peptides_to_fasta(preds))

    if options.basefasta_path:
        # TODO - replace from base fasta (binders only) plus prediction
        # annotation
        print("N/A")
        sys.exit(0)

    prs = annotate_protein_from_peptides(preds)
    _write_and_exit(proteins_to_fasta(prs))