def test_peptides_from_varaints(self):
    """
    Smoke test for peptide generation from two variants on NM_080751.

    Builds a C>T substitution and an insertion ("testInsertion"), then runs
    both the one-step generator (variants -> peptides) and the step-wise
    pipeline (variants -> transcripts -> proteins -> peptides) with peptide
    length 9. The sizes of the intermediate results are printed; nothing is
    asserted, so the test only fails if one of the generators raises.

    NOTE(review): presumably needs network access to the BioMart host handed
    to MartsAdapter - confirm before running offline.
    """
    transcript_id = 'NM_080751'
    coding = {
        transcript_id: MutationSyntax(transcript_id, 2629, 876,
                                      'c.2630C>T', 'p.Pro877Leu'),
    }
    var = Variant('line0', 0, 20, 2621905, 'C', 'T', coding, True, False)
    var.gene = 'TMC2'

    ma = MartsAdapter(biomart="http://ensembl.org")

    insertion = Variant(
        "testInsertion", 2, 20, 2621899, "", "AAAAAA",
        {transcript_id: MutationSyntax(transcript_id, 2625, 876,
                                       'c.2630C>T', 'p.Pro877Leu')},
        True, False)
    # Named ``variants`` rather than ``vars`` so the builtin is not shadowed.
    variants = [var, insertion]

    # One-step generation: variants -> peptides.
    direct_peptides = list(Generator.generate_peptides_from_variants(
        variants, 9, ma, id_type=EIdentifierTypes.REFSEQ, peptides=None))
    print(len(direct_peptides))

    # Using a tweaked generator that takes another sequence source if the
    # sequence is too short with respect to the given variants - in this case
    # a newer/older sequence from mart than the one that was used as reference
    # in the annotation process.
    transcripts = list(Generator.generate_transcripts_from_variants(
        variants, ma, id_type=EIdentifierTypes.REFSEQ))
    print(len(transcripts[0]))

    proteins = list(Generator.generate_proteins_from_transcripts(
        transcripts, to_stop=True))
    stepwise_peptides = list(
        Generator.generate_peptides_from_proteins(proteins, 9))
    print(len(stepwise_peptides))

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(len(variants))
def test_real_life_test(self):
    """
    Check that both peptide generation routes agree on the ANNOVAR example.

    Reads ``Data/examples/test_annovar.out`` from the installed Fred2 package
    and generates 9-mers twice: directly from the variants, and step by step
    via transcripts and proteins. The two peptide sets (compared by their
    string form) must be identical.

    NOTE(review): presumably needs network access to the GRCh37 BioMart
    service handed to MartsAdapter - confirm before running offline.
    """
    mart = MartsAdapter(
        biomart="http://grch37.ensembl.org/biomart/martservice?query=")
    ano_path = os.path.join(os.path.dirname(inspect.getfile(Fred2)),
                            "Data/examples/test_annovar.out")
    # Named ``variants`` rather than ``vars`` so the builtin is not shadowed.
    variants = read_annovar_exonic(ano_path)

    # Route 1: variants -> peptides in one step.
    peps = set(map(str, Generator.generate_peptides_from_variants(
        variants, 9, mart, EIdentifierTypes.REFSEQ)))

    # Route 2: variants -> transcripts -> proteins -> peptides.
    transcripts = Generator.generate_transcripts_from_variants(
        variants, mart, EIdentifierTypes.REFSEQ)
    proteins = Generator.generate_proteins_from_transcripts(transcripts)
    peps_from_prot = set(map(
        str, Generator.generate_peptides_from_proteins(proteins, 9)))

    # Set equality covers both difference checks and, unlike
    # assertTrue(len(...) == 0), reports the offending peptides on failure.
    self.assertEqual(peps, peps_from_prot)
def test_real_life_test(self):
    """
    Compare direct and step-wise peptide generation on real ANNOVAR output.

    The variants come from ``Data/examples/test_annovar.out`` inside the
    installed Fred2 package. 9-mers are generated once straight from the
    variants and once through the transcript and protein generators; both
    routes have to yield exactly the same set of peptide strings.

    NOTE(review): presumably needs network access to the GRCh37 BioMart
    service handed to MartsAdapter - confirm before running offline.
    """
    mart = MartsAdapter(
        biomart="http://grch37.ensembl.org/biomart/martservice?query=")
    package_dir = os.path.dirname(inspect.getfile(Fred2))
    ano_path = os.path.join(package_dir, "Data/examples/test_annovar.out")
    # ``variants`` instead of ``vars``: do not shadow the builtin.
    variants = read_annovar_exonic(ano_path)

    direct = Generator.generate_peptides_from_variants(
        variants, 9, mart, EIdentifierTypes.REFSEQ)
    peps = set(map(str, direct))

    stepwise = Generator.generate_peptides_from_proteins(
        Generator.generate_proteins_from_transcripts(
            Generator.generate_transcripts_from_variants(
                variants, mart, EIdentifierTypes.REFSEQ)),
        9)
    peps_from_prot = set(map(str, stepwise))

    # One equality assertion replaces the two "difference is empty" checks
    # and prints the differing peptides when the sets disagree.
    self.assertEqual(peps, peps_from_prot)
def _write_and_exit(path, content):
    """Write *content* to the file at *path* and end the program with status 0."""
    with open(path, 'w') as handle:
        handle.write(content)
    sys.exit(0)


def __main__():
    """
    Command line entry point: turn a variant file into a FASTA file.

    Reads the variants given with ``-V`` (only ``.vcf`` is supported), builds
    the varied proteins and writes one of the following to ``-o``, depending
    on the options:

    1. ``-P`` without prediction: the varied proteins (stop gains removed).
    2. no prediction: the varied peptides of length ``-d`` (default 9).
    3. ``-p`` with ``-a``: peptides (or, with ``-P``, proteins annotated from
       the peptides) together with their predictions.
    4. additionally ``-f``: as 3., restricted to prediction rows in which at
       least one value is above the threshold.

    With ``-b`` the protein output is not implemented yet; ``N/A`` is printed
    instead. A log file named after the output file plus a timestamp is
    written next to the output file.

    The function always terminates through ``sys.exit``: status 0 on success,
    status 1 on invalid options, unreadable input or prediction failures.
    """
    parser = argparse.ArgumentParser()
    # Explicit version action instead of ArgumentParser(version=...): that
    # constructor argument is deprecated in Python 2.7 and rejected by
    # Python 3, while this form yields the same -v/--version option on both.
    parser.add_argument('-v', '--version', action='version', version=VERSION)
    parser.add_argument('-V', '--variations', dest="var_file",
                        help='<Required> full path to the input variations',
                        required=True)
    parser.add_argument('-o', "--outfile", dest="outfile_path",
                        help="Created fasta file", required=True)
    parser.add_argument('-d', "--digest", dest="digest", type=int,
                        help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a', "--alleles", dest="alleles",
                        help="Input alleles for prediction")
    parser.add_argument('-p', "--predict", dest="predict_with",
                        help="Method of prediction, needs alleles & length, allowed:[{m}]".format(m=PRED_METH))
    parser.add_argument('-f', "--filter", dest="filter", type=float,
                        help="Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict")
    parser.add_argument('-P', "--Proteins", dest="only_proteins",
                        action='store_true', help="Will write only proteins.")
    parser.add_argument('-b', "--base", dest="basefasta_path",
                        help="If given, entries are replaced by the variation.")

    # Must run before parse_args(): with required options, parse_args()
    # itself aborts on an empty command line and the help was never shown.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)
    options = parser.parse_args()

    if options.filter and not options.predict_with:
        parser.print_help()
        print("Need predict option with filter, aborting!")
        sys.exit(1)
    if options.predict_with and not options.alleles:
        parser.print_help()
        print("Need alleles with predict option, aborting!")
        sys.exit(1)

    log_path = (os.path.splitext(options.outfile_path)[0]
                + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now())
                + '.log')
    logging.basicConfig(filename=log_path, filemode='w+',
                        level=logging.DEBUG)  # , format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path
                 + " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    # Open design questions:
    #   complete proteins? only containing binders? k-mers? binders only?
    #   FastaSlicer.py? remove original if homozygous (needs fasta input)?
    #   add germline variant option, or expect all to be in one vcf?

    vt = os.path.splitext(options.var_file)[-1]
    if vt == ".vcf":
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    else:
        # Also covers ".GSvar": reading that format is not wired in yet
        # (FileReader.read_GSvar), and falling through without variants
        # would only fail later with a NameError.
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print(m)
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")

    # TODO guess id_type for mart_db from accessions
    transcript_gen = g.generate_transcripts_from_variants(
        vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried
    for p in proteins:
        # assume gene name from first variant; list(...) keeps the lookup
        # valid where dict.values() is a view instead of a list
        p.gene_id = list(p.vars.values())[0][0].gene
    proteins = [p for p in proteins if not is_stop_gain(p)]  # kick out stop gains

    predicting = bool(options.predict_with or options.filter)

    # First exit option: plain proteins
    if not predicting and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print("N/A")
            sys.exit(0)
        _write_and_exit(options.outfile_path, proteins_to_fasta(proteins))

    # From now on, digestion must be set somehow
    digest = options.digest or 9
    peptides = list(g.generate_peptides_from_proteins(proteins, digest))
    peptides_var = [
        x for x in peptides
        if any(x.get_variants_by_protein(y) for y in x.proteins.keys())
    ]  # removing unvaried

    # Second exit option: plain peptides
    if not predicting:
        _write_and_exit(options.outfile_path, peptides_to_fasta(peptides_var))

    # From now on, predictions are needed
    try:
        target_alleles_set = set(
            FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(f=options.alleles)
        logging.error(m)
        print("{} what: {}".format(m, e))
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception:
        m = "Could not initialize prediction method {f}, aborting.".format(f=options.predict_with)
        logging.error(m)
        print(m)
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        # Reports the variant file: the parser defines no "inf" option, so
        # the former options.inf raised AttributeError inside this handler.
        print("something went wrong with the prediction {} {} what: {}".format(
            options.var_file, options.predict_with, e))
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    # NOTE(review): assumes every index entry is a tuple whose first element
    # is the peptide object and whose remaining elements are the metadata
    # keys - confirm against the predictor's result format.
    for idx, row in preds.iterrows():
        for key in idx[1:]:
            idx[0].log_metadata(key, dict(zip(row.index, row.values)))

    # Third exit option uses all predictions, fourth only those rows in
    # which at least one value exceeds the filter threshold (kick out
    # nonbinder).
    if options.filter:
        selected = preds[(preds > options.filter).any(axis=1)]
    else:
        selected = preds

    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta (binders only) plus prediction annotation
            print("N/A")
            sys.exit(0)
        prs = annotate_protein_from_peptides(selected)
        _write_and_exit(options.outfile_path, proteins_to_fasta(prs))
    _write_and_exit(options.outfile_path, peptides_to_fasta(selected))
def make_predictions_from_variants(variants_all, methods, alleles, minlength,
                                   maxlength, martsadapter, protein_db,
                                   identifier, metadata, transcriptProteinMap):
    """
    Predict epitopes for all variant-carrying peptides of the given variants.

    Proteins are generated once from ``variants_all``. For every peptide
    length in ``range(minlength, maxlength)`` the peptides that carry at
    least one variant are collected, peptides found in ``protein_db`` are
    dropped, and each method in ``methods`` is run on the rest. The merged
    results of one length are turned into an annotated data frame.

    :param variants_all: variants to generate transcripts/proteins from
    :param methods: prediction methods, each written ``<name>-<version>``
    :param alleles: alleles to predict for; ``locus``, ``supertype`` and
                    ``subtype`` of each allele are read
    :param minlength: smallest peptide length (inclusive)
    :param maxlength: upper bound of the peptide lengths (exclusive, as in
                      ``range``)
    :param martsadapter: adapter handed to the transcript generator
    :param protein_db: object whose ``exists(sequence)`` tells whether a
                       peptide sequence is a self peptide
    :param identifier: not used by this function
    :param metadata: names of extra columns; each is filled through
                     ``create_metadata_column_value``
    :param transcriptProteinMap: not used by this function
    :return: tuple ``(pred_dataframes, statistics, all_peptides_filtered)``:
             one data frame per peptide length that produced results, a dict
             with the methods and the variant/peptide counts, and all
             peptides that passed the self-peptide filter
    """
    # all variant-relevant peptides, before and after self-peptide filtering
    all_peptides = []
    all_peptides_filtered = []

    # dictionaries for syfpeithi matrices max values and allele mapping
    max_values_matrices = {}
    allele_string_map = {}

    # one data frame per peptide length
    pred_dataframes = []

    prots = list(generator.generate_proteins_from_transcripts(
        generator.generate_transcripts_from_variants(
            variants_all, martsadapter, ID_SYSTEM_USED)))

    for peplen in range(minlength, maxlength):
        # keep only peptides which are 'variant relevant'
        peptides = [
            pep
            for pep in generator.generate_peptides_from_proteins(prots, peplen)
            if any(pep.get_variants_by_protein(prot)
                   for prot in pep.proteins.keys())
        ]

        # filter out self peptides; a set keeps the membership test O(1)
        selfies = {str(p) for p in peptides if protein_db.exists(str(p))}
        filtered_peptides = [p for p in peptides if str(p) not in selfies]

        all_peptides.extend(peptides)
        all_peptides_filtered.extend(filtered_peptides)

        results = []
        if filtered_peptides:
            for m in methods:
                try:
                    # the split stays inside the try block so that a method
                    # string without version is reported like any other
                    # failed prediction
                    method_parts = m.split('-')
                    predictor = EpitopePredictorFactory(
                        method_parts[0], version=method_parts[1])
                    results.append(
                        predictor.predict(filtered_peptides, alleles=alleles))
                except Exception:
                    # "except Exception" instead of a bare except, so that
                    # KeyboardInterrupt and SystemExit are no longer swallowed
                    logging.warning(
                        "Prediction for length {length} and allele {allele} not possible with {method}."
                        .format(length=peplen,
                                allele=','.join([str(a) for a in alleles]),
                                method=m))

        if not results:
            continue

        df = results[0].merge_results(results[1:])

        for a in alleles:
            conv_allele = "%s_%s%s" % (a.locus, a.supertype, a.subtype)
            matrix_key = '%s_%i' % (conv_allele, peplen)
            allele_string_map['%s_%s' % (a, peplen)] = matrix_key
            max_values_matrices[matrix_key] = get_matrix_max_score(
                conv_allele, peplen)

        # annotation columns derived from the index entries
        df.insert(0, 'length', df.index.map(create_length_column_value))
        df['chr'] = df.index.map(create_variant_chr_column_value)
        df['pos'] = df.index.map(create_variant_pos_column_value)
        df['gene'] = df.index.map(create_gene_column_value)
        df['transcripts'] = df.index.map(create_transcript_column_value)
        df['proteins'] = df.index.map(create_protein_column_value)
        df['variant type'] = df.index.map(create_variant_type_column_value)
        df['synonymous'] = df.index.map(create_variant_syn_column_value)
        df['homozygous'] = df.index.map(create_variant_hom_column_value)
        df['variant details (genomic)'] = df.index.map(
            create_mutationsyntax_genome_column_value)
        df['variant details (protein)'] = df.index.map(
            create_mutationsyntax_column_value)

        # reset index to have index as columns
        df.reset_index(inplace=True)

        # Columns with '*' in their name (presumably the allele columns -
        # TODO confirm) get an affinity and a binder column right behind
        # them and are renamed to "<name> score". The column list is
        # snapshotted because the frame is modified inside the loop.
        for c in list(df.columns):
            if '*' not in str(c):
                continue
            idx = df.columns.get_loc(c)
            df.insert(
                idx + 1, '%s affinity' % c,
                df.apply(lambda x: create_affinity_values(
                    str(c), int(x['length']), float(x[c]), x['Method'],
                    max_values_matrices, allele_string_map), axis=1))
            df.insert(
                idx + 2, '%s binder' % c,
                df.apply(lambda x: create_binder_values(
                    float(x['%s affinity' % c]), x['Method']), axis=1))
            df = df.rename(columns={c: '%s score' % c})
            df['%s score' % c] = df['%s score' % c].map(
                lambda x: round(x, 4))

        for c in metadata:
            df[c] = df.apply(
                lambda row: create_metadata_column_value(row, c), axis=1)

        df = df.rename(columns={'Seq': 'sequence', 'Method': 'method'})
        pred_dataframes.append(df)

    statistics = {
        'prediction_methods': methods,
        'number_of_variants': len(variants_all),
        'number_of_peptides': len(all_peptides),
        'number_of_peptides_after_filtering': len(all_peptides_filtered)
    }

    return pred_dataframes, statistics, all_peptides_filtered
def _write_and_exit(path, content):
    """Write *content* to the file at *path* and end the program with status 0."""
    with open(path, 'w') as handle:
        handle.write(content)
    sys.exit(0)


def __main__():
    """
    Command line entry point: turn a variant file into a FASTA file.

    Reads the variants given with ``-V`` (only ``.vcf`` is supported), builds
    the varied proteins and writes one of the following to ``-o``, depending
    on the options:

    1. ``-P`` without prediction: the varied proteins (stop gains removed).
    2. no prediction: the varied peptides of length ``-d`` (default 9).
    3. ``-p`` with ``-a``: peptides (or, with ``-P``, proteins annotated from
       the peptides) together with their predictions.
    4. additionally ``-f``: as 3., restricted to prediction rows in which at
       least one value is above the threshold.

    With ``-b`` the protein output is not implemented yet; ``N/A`` is printed
    instead. A log file named after the output file plus a timestamp is
    written next to the output file.

    The function always terminates through ``sys.exit``: status 0 on success,
    status 1 on invalid options, unreadable input or prediction failures.
    """
    parser = argparse.ArgumentParser()
    # Explicit version action instead of ArgumentParser(version=...): that
    # constructor argument is deprecated in Python 2.7 and rejected by
    # Python 3, while this form yields the same -v/--version option on both.
    parser.add_argument('-v', '--version', action='version', version=VERSION)
    parser.add_argument('-V', '--variations', dest="var_file",
                        help='<Required> full path to the input variations',
                        required=True)
    parser.add_argument('-o', "--outfile", dest="outfile_path",
                        help="Created fasta file", required=True)
    parser.add_argument('-d', "--digest", dest="digest", type=int,
                        help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a', "--alleles", dest="alleles",
                        help="Input alleles for prediction")
    parser.add_argument('-p', "--predict", dest="predict_with",
                        help="Method of prediction, needs alleles & length, allowed:[{m}]".format(m=PRED_METH))
    parser.add_argument('-f', "--filter", dest="filter", type=float,
                        help="Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict")
    parser.add_argument('-P', "--Proteins", dest="only_proteins",
                        action='store_true', help="Will write only proteins.")
    parser.add_argument('-b', "--base", dest="basefasta_path",
                        help="If given, entries are replaced by the variation.")

    # Must run before parse_args(): with required options, parse_args()
    # itself aborts on an empty command line and the help was never shown.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)
    options = parser.parse_args()

    if options.filter and not options.predict_with:
        parser.print_help()
        print("Need predict option with filter, aborting!")
        sys.exit(1)
    if options.predict_with and not options.alleles:
        parser.print_help()
        print("Need alleles with predict option, aborting!")
        sys.exit(1)

    log_path = (os.path.splitext(options.outfile_path)[0]
                + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now())
                + '.log')
    logging.basicConfig(filename=log_path, filemode='w+',
                        level=logging.DEBUG)  # , format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path
                 + " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    # Open design questions:
    #   complete proteins? only containing binders? k-mers? binders only?
    #   FastaSlicer.py? remove original if homozygous (needs fasta input)?
    #   add germline variant option, or expect all to be in one vcf?

    vt = os.path.splitext(options.var_file)[-1]
    if vt == ".vcf":
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    else:
        # Also covers ".GSvar": reading that format is not wired in yet
        # (FileReader.read_GSvar), and falling through without variants
        # would only fail later with a NameError.
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print(m)
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")

    # TODO guess id_type for mart_db from accessions
    transcript_gen = g.generate_transcripts_from_variants(
        vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried
    for p in proteins:
        # assume gene name from first variant; list(...) keeps the lookup
        # valid where dict.values() is a view instead of a list
        p.gene_id = list(p.vars.values())[0][0].gene
    proteins = [p for p in proteins if not is_stop_gain(p)]  # kick out stop gains

    predicting = bool(options.predict_with or options.filter)

    # First exit option: plain proteins
    if not predicting and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print("N/A")
            sys.exit(0)
        _write_and_exit(options.outfile_path, proteins_to_fasta(proteins))

    # From now on, digestion must be set somehow
    digest = options.digest or 9
    peptides = list(g.generate_peptides_from_proteins(proteins, digest))
    peptides_var = [
        x for x in peptides
        if any(x.get_variants_by_protein(y) for y in x.proteins.keys())
    ]  # removing unvaried

    # Second exit option: plain peptides
    if not predicting:
        _write_and_exit(options.outfile_path, peptides_to_fasta(peptides_var))

    # From now on, predictions are needed
    try:
        target_alleles_set = set(
            FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(f=options.alleles)
        logging.error(m)
        print("{} what: {}".format(m, e))
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception:
        m = "Could not initialize prediction method {f}, aborting.".format(f=options.predict_with)
        logging.error(m)
        print(m)
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        # Reports the variant file: the parser defines no "inf" option, so
        # the former options.inf raised AttributeError inside this handler.
        print("something went wrong with the prediction {} {} what: {}".format(
            options.var_file, options.predict_with, e))
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    # NOTE(review): assumes every index entry is a tuple whose first element
    # is the peptide object and whose remaining elements are the metadata
    # keys - confirm against the predictor's result format.
    for idx, row in preds.iterrows():
        for key in idx[1:]:
            idx[0].log_metadata(key, dict(zip(row.index, row.values)))

    # Third exit option uses all predictions, fourth only those rows in
    # which at least one value exceeds the filter threshold (kick out
    # nonbinder).
    if options.filter:
        selected = preds[(preds > options.filter).any(axis=1)]
    else:
        selected = preds

    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta (binders only) plus prediction annotation
            print("N/A")
            sys.exit(0)
        prs = annotate_protein_from_peptides(selected)
        _write_and_exit(options.outfile_path, proteins_to_fasta(prs))
    _write_and_exit(options.outfile_path, peptides_to_fasta(selected))
def test_peptides_from_variants(self):
    """
    Create multiple peptides, given a set containing heterozygous variants.

    Variants:
    3-DEL(-2)  , 5-INS(+3)  , 7-DEL(-4)
    HET-DEL(-2), HOM-INS(+3), HET-DEL(-1)

    Reference sequence:
    AAAAACCCCCGGGGG
    AAATTTGGGGG (DEL,INS,DEL)
    AAATTTCCCCCGGGGG (DEL,INS)
    AAAAATTTGGGGG (INS,DEL)
    AAAAATTTCCCCCGGGGG (INS)

    GGGGGCCCCCAAAAA
    GGGTTTCAAAAA (DEL,INS,DEL)
    GGGTTTCCCCCAAAAA (DEL,INS)
    GGGGGTTTCAAAAA (INS,DEL)
    GGGGGTTTCCCCCAAAAA (INS)

    Resulting protein sequences:
    KFG
    KNLG
    KFPPG
    KNFPRG

    GFK
    GGLK
    GFPPK
    GGFPQK

    Resulting peptides of length 3:
    KFG +
    KNL + NLG +
    KFP + FPP + PPG +
    KNF + NFP + FPR + PRG +
    GFK +
    GGL + GLK +
    GFP + FPP + PPK +
    GGF + GFP + FPQ + PQK +

    The peptides generated directly from the variants must equal both the
    peptides generated step-wise (transcripts -> proteins -> peptides) and
    the expected set listed above.
    """
    dummy_db = DummyAdapter()

    exp_peps = {
        'PRG', 'GLK', 'PPG', 'KFP', 'GFK', 'PPK', 'GFP', 'PQK', 'KNL',
        'KFG', 'GGF', 'FPQ', 'FPP', 'NLG', 'FPR', 'KNF', 'GGL', 'NFP',
    }

    # 1) INS, SNP, DEL
    dummy_vars = [var_10, var_11, var_12]

    # Route 1: variants -> peptides in one step.
    peps = set(map(str, Generator.generate_peptides_from_variants(
        dummy_vars, 3, dummy_db, EIdentifierTypes.REFSEQ)))

    # Route 2: variants -> transcripts -> proteins -> peptides.
    transcripts = Generator.generate_transcripts_from_variants(
        dummy_vars, dummy_db, EIdentifierTypes.REFSEQ)
    proteins = Generator.generate_proteins_from_transcripts(transcripts)
    peps_from_prot = set(map(
        str, Generator.generate_peptides_from_proteins(proteins, 3)))

    # Set equality replaces the pairs of "difference is empty" checks and
    # names the missing/unexpected peptides when a comparison fails.
    self.assertEqual(peps, peps_from_prot)
    self.assertEqual(peps, exp_peps)
def test_peptides_from_variants(self):
    """
    Create multiple peptides, given a set containing heterozygous variants.

    Variants:
    3-DEL(-2)  , 5-INS(+3)  , 7-DEL(-4)
    HET-DEL(-2), HOM-INS(+3), HET-DEL(-1)

    Reference sequence:
    AAAAACCCCCGGGGG
    AAATTTGGGGG (DEL,INS,DEL)
    AAATTTCCCCCGGGGG (DEL,INS)
    AAAAATTTGGGGG (INS,DEL)
    AAAAATTTCCCCCGGGGG (INS)

    GGGGGCCCCCAAAAA
    GGGTTTCAAAAA (DEL,INS,DEL)
    GGGTTTCCCCCAAAAA (DEL,INS)
    GGGGGTTTCAAAAA (INS,DEL)
    GGGGGTTTCCCCCAAAAA (INS)

    Resulting protein sequences:
    KFG
    KNLG
    KFPPG
    KNFPRG

    GFK
    GGLK
    GFPPK
    GGFPQK

    Resulting peptides of length 3:
    KFG +
    KNL + NLG +
    KFP + FPP + PPG +
    KNF + NFP + FPR + PRG +
    GFK +
    GGL + GLK +
    GFP + FPP + PPK +
    GGF + GFP + FPQ + PQK +

    Both generation routes (direct, and via transcripts and proteins) have
    to produce exactly the expected peptide set listed above.
    """
    dummy_db = DummyAdapter()

    exp_peps = {
        'PRG', 'GLK', 'PPG', 'KFP', 'GFK', 'PPK', 'GFP', 'PQK', 'KNL',
        'KFG', 'GGF', 'FPQ', 'FPP', 'NLG', 'FPR', 'KNF', 'GGL', 'NFP',
    }

    # 1) INS, SNP, DEL
    dummy_vars = [var_10, var_11, var_12]

    direct = Generator.generate_peptides_from_variants(
        dummy_vars, 3, dummy_db, EIdentifierTypes.REFSEQ)
    peps = set(map(str, direct))

    stepwise = Generator.generate_peptides_from_proteins(
        Generator.generate_proteins_from_transcripts(
            Generator.generate_transcripts_from_variants(
                dummy_vars, dummy_db, EIdentifierTypes.REFSEQ)),
        3)
    peps_from_prot = set(map(str, stepwise))

    # assertEqual on sets is equivalent to checking that both differences
    # are empty, but shows the differing peptides on failure.
    self.assertEqual(peps, peps_from_prot)
    self.assertEqual(peps, exp_peps)