class ConfigForm(Form):
    """Web form for configuring an epitope prediction run.

    Field choices are built at class-definition time from the `base`
    module, so the available predictors/alleles are fixed at import.
    NOTE(review): `pm`, `ps1`, `ps2`, `p1`, `p2` and `x` are class-level
    temporaries used only to build choice lists; they remain as class
    attributes after definition.
    """
    # output folder for results; styled via render_kw CSS class
    path = TextField('output path', default='results', validators=[DataRequired()], render_kw={"class": "textbox"})
    # (value, label) pairs for every known predictor
    pm = [(i,i) for i in base.predictors]
    predictors = SelectMultipleField('predictors', choices=pm, render_kw={"class": "combobox"})
    # default peptide lengths for MHC class I and II predictions
    mhc1_length = IntegerField('mhc1 length', default=11)
    mhc2_length = IntegerField('mhc2 length', default=15)
    # input sequence file; is_seqfile()/exists() are custom validators defined elsewhere
    sequence_file = TextField('sequence file', validators=[DataRequired(), is_seqfile(), exists()], default='')
    # false_values controls which submitted values count as unchecked
    overwrite = BooleanField('overwrite', default=False, false_values={False, 'n', ''})
    cpus = IntegerField('cpus', default=1)
    # MHC-I preset allele lists, with an empty entry prepended as "no preset"
    ps1 = [(i,i) for i in base.mhc1_presets]
    ps1.insert(0, ('',''))
    mhc1_presets = SelectField('MHC-I presets', choices=ps1, default='')
    # MHC-II preset allele lists, same empty-first convention
    ps2 = [(i,i) for i in base.mhc2_presets]
    ps2.insert(0, ('',''))
    mhc2_presets = SelectField('MHC-II presets', choices=ps2, default='')
    # MHC-I allele choices come from the iedbmhc1 predictor
    p1 = base.get_predictor('iedbmhc1')
    x = [(i,i) for i in p1.get_alleles()]
    mhc1_alleles = SelectMultipleField('MHC-I alleles', choices=x, render_kw={"class": "combobox"})
    # MHC-II allele choices come from the tepitope predictor
    p2 = base.get_predictor('tepitope')
    x = [(i,i) for i in p2.get_alleles()]
    #drballeles = base.getDRBList(mhc2alleles)
    #dqpalleles = base.getDQPList(mhc2alleles)
    mhc2_alleles = SelectMultipleField('MHC-II alleles', choices=x, render_kw={"class": "combobox"})
    # widget hint: show 5 rows in the multi-select
    mhc2_alleles.size=5
    # local filesystem paths to the downloaded IEDB tool distributions
    iedbmhc1_path = TextField('iedb MHC-I tools path')
    iedbmhc2_path = TextField('iedb MHC-II tools path')
def run(self):
    """Run the prediction workflow for every configured predictor.

    For each predictor: verifies it is installed, optionally clears a
    previous results folder, selects the MHC-I or MHC-II alleles/length,
    then predicts either peptides (kept in memory) or whole proteins
    (streamed to disk). Successful predictors are collected in
    self.preds and self.analysis() is called at the end.
    """
    preds = []
    for p in self.predictors:
        P = base.get_predictor(p)
        # check_install may return None/False for a missing tool; keep the
        # explicit comparison so a None result is not treated as failure
        if P.check_install() == False:
            print ('%s not installed' %P.name)
            continue
        savepath = os.path.join(self.path, p)
        if self.overwrite == True and os.path.exists(savepath):
            shutil.rmtree(savepath)
        if p in base.mhc1_predictors:
            a = self.mhc1_alleles
            length = self.mhc1_length
            check_mhc1_length(length)
            method = self.iedb_mhc1_method
        else:
            a = self.mhc2_alleles
            length = self.mhc2_length
            method = self.iedb_mhc2_method
        # empty string means "use the predictor's default method"
        if method == '':
            method = None
        print ('predictor:', p)
        print ('alleles:',', '.join(a))
        print ('length:',length)
        print ('cpus:', self.cpus)
        if 'iedb' in p:
            if iedb_checks(method) == False:
                continue
        if self.peptides is not None:
            P.predict_peptides(self.peptides, length=length, alleles=a,
                               path=self.path, overwrite=self.overwrite,
                               verbose=self.verbose, method=method,
                               cpus=self.cpus, compression=self.compression)
            # load the results into the predictor
            P.load()
        else:
            # data saved to disk to avoid large memory usage
            P.predict_proteins(self.sequences, length=length, alleles=a,
                               names=self.names, path=savepath,
                               overwrite=self.overwrite, verbose=self.verbose,
                               method=method, cpus=self.cpus,
                               compression=self.compression)
            # keep reference to path where results saved
            P.path = savepath
            # clear data as we will reload during analysis from disk
            P.data = None
        preds.append(P)
    print ('-----------------------------')
    self.preds = preds
    self.analysis()
    print ('results saved in the folder %s' %self.path)
    return
def get_results(path, predictor, name):
    """Load previously saved prediction results for one sequence.

    Args:
        path: base results folder
        predictor: predictor name used to create the sub-folder
        name: sequence/protein name (csv file stem)
    Returns:
        the predictor object with its data loaded
    """
    P = base.get_predictor(predictor)
    csvfile = os.path.join(path, predictor, name) + '.csv'
    P.load(csvfile)
    return P
def predict_binding(df, predictor='netmhcpan', alleles=[], verbose=False, cpus=1,
                    cutoff=.95, cutoff_method='default'):
    """
    Predict binding scores for mutated and wt peptides (if present) from
    supplied variants.

    Args:
        df: pandas dataframe with peptide sequences, requires at least 2 columns
            'peptide' - the mutant peptide
            'wt' - a corresponding wild type peptide
            this data could be generated from get_mutant_sequences or from an
            external program
        predictor: mhc binding prediction method
        alleles: list of alleles
    Returns:
        dataframe with mutant and wt binding scores for all alleles
    """
    P = base.get_predictor(predictor, scoring='ligand')
    print (P)
    print ('predicting mhc binding for %s peptides with %s' %(len(df), P.name))
    peps = list(df.peptide)
    res = P.predict_peptides(peps, alleles=alleles, cpus=cpus,
                             cutoff=cutoff, cutoff_method=cutoff_method,
                             drop_columns=True)
    if res is None:
        print ('no binding predictions!')
        return
    # predict closest matching peptide affinity
    if verbose == True:
        print ('predicting wt peptides')
    wtpeps = list(df.closest)
    b_wt = P.predict_peptides(wtpeps, alleles=alleles, cpus=cpus,
                              cutoff=cutoff, cutoff_method=cutoff_method,
                              drop_columns=True)
    # combine mutant and matching binding predictions
    res = combine_wt_scores(res, b_wt, P.scorekey)
    # positional axis argument to drop() was removed in pandas 2.0,
    # use the explicit columns= keyword instead
    res = res.drop(columns=['pos','name'])
    # combine binding results with main dataframe
    res = df.merge(res, on='peptide')
    res['binding_diff'] = res[P.scorekey]/res.matched_score
    # anchor position mutated in any 9-mers
    res['anchor'] = res.apply(anchor_mutated, axis=1)
    # hydrophobicity and net charge
    res = analysis.peptide_properties(res, 'peptide')
    res['length'] = res.peptide.str.len()
    # rename some columns
    res = res.rename(columns={'rank':'binding_rank','alleles':'promiscuity'})
    res = res.sort_values('binding_rank', ascending=True)
    return res
def run(self):
    """Run the whole-protein prediction workflow.

    Iterates the configured predictors, picks class I or II
    alleles/length per predictor, runs protein predictions to disk and
    reloads the results. Aborts if a predictor produced no data.
    """
    preds = []
    # use is None rather than == None for the identity test
    if self.names is None:
        self.names = self.sequences.locus_tag
    for p in self.predictors:
        P = base.get_predictor(p)
        savepath = os.path.join(self.path, p)
        if self.overwrite == True and os.path.exists(savepath):
            shutil.rmtree(savepath)
        if p in ['iedbmhc1', 'mhcflurry']:
            a = self.mhc1_alleles
            length = self.mhc1_length
            check_mhc1_length(length)
            method = self.iedb_mhc1_method
        else:
            a = self.mhc2_alleles
            length = self.mhc2_length
            method = self.iedb_mhc2_method
        # empty string means "use the predictor's default method"
        if method == '':
            method = None
        print('predictor:', p)
        print('alleles:', a)
        print('length:', length)
        print('cpus:', self.cpus)
        if 'iedb' in p:
            if iedb_checks(method) == False:
                continue
        P.predictProteins(self.sequences, length=length, alleles=a,
                          names=self.names, path=savepath,
                          overwrite=self.overwrite, verbose=self.verbose,
                          method=method, cpus=self.cpus)
        # load results into predictor
        P.load(path=savepath)
        if P.data is None:
            print('no results were found, did predictor run?')
            return
        preds.append(P)
    print('-----------------------------')
    self.preds = preds
    self.analysis()
    if self.plots == True:
        self.plot_results()
    return
def check_installed():
    """Return the names of all predictors whose tools are installed.

    Queries every known predictor class and keeps only those whose
    check_install() reports True.
    """
    names = base.get_predictor_classes()
    return [name for name in names
            if base.get_predictor(name).check_install() is True]
def list_alleles():
    """Print the supported alleles for every available predictor, one per line."""
    for p in base.predictors:
        print(p)
        print('-----------------------------')
        P = base.get_predictor(p)
        x = P.getAlleles()
        # some predictors may not return a list; use isinstance for the type check
        if isinstance(x, list):
            for i in x:
                print(i)
        print()
    return
def list_alleles():
    """Print the supported alleles for every available predictor.

    Alleles are printed in comma-separated rows of 7 for readability.
    """
    n = 7
    for p in base.predictors:
        print (p)
        print ('-----------------------------')
        P = base.get_predictor(p)
        x = P.get_alleles()
        # some predictors may not return a list; use isinstance for the type check
        if isinstance(x, list):
            for i in range(0, len(x), n):
                chunk = x[i:i+n]
                print(', '.join(chunk))
        print ()
    return
def check_iedbmhc2_path():
    """Check that the IEDB MHC-II tools are present at the configured path.

    Returns:
        True if the tools folder exists, False otherwise.
        (The original fell through and returned None on success; an
        explicit True keeps callers using `== False` working while giving
        a consistent boolean result.)
    """
    P = base.get_predictor('iedbmhc2')
    if not os.path.exists(P.iedbmhc2_path):
        print ('IEDB MHC-II tools not found, check path')
        return False
    return True
def run(self):
    """Run workflow for multiple samples and prediction methods.

    For each labelled vcf file: computes (or reloads a pickled cache of)
    variant effects, saves them as a csv table, then runs every
    configured predictor over the mutant peptides and writes per-sample,
    per-predictor result csv files. Finally writes a sample_labels.csv
    summary for all samples.
    """
    print('running neoepitope predictions')
    path = self.path
    overwrite = self.overwrite
    files = self.vcf_files
    preds = self.predictors
    labels = self.get_file_labels(files)
    cutoffs = self.cutoffs
    # pad the cutoff list by repeating the first value so there is one per predictor
    if len(cutoffs) < len(preds):
        cutoffs = [cutoffs[0] for p in preds]
    for f in labels:
        print(f)
        infile = labels[f]['filename']
        #file to save variants to, if present we can skip
        eff_csv = os.path.join(path, 'variant_effects_%s.csv' % f)
        eff_obj = os.path.join(path, 'variant_effects_%s.pickle' % f)
        if not os.path.exists(eff_obj) or overwrite == True:
            #get variant effects for each file and then iterate over predictors
            variants = load_variants(vcf_file=infile)
            labels[f]['variants'] = len(variants)
            print('getting variant effects')
            effects = get_variant_effects(variants, self.verbose)
            #serialize variant effects
            effects_to_pickle(effects, eff_obj)
        else:
            #else reload from last run
            # NOTE(review): pickle file is created by this workflow, but the
            # handle opened here is never explicitly closed
            effects = pickle.load(open(eff_obj, 'rb'))
        #save effects as table
        eff_data = effects_to_dataframe(effects)
        eff_data['sample'] = f
        eff_data.to_csv(eff_csv)
        # i indexes cutoffs in step with the predictor loop below
        i = 0
        for predictor in self.predictors:
            outfile = os.path.join(path, 'results_%s_%s.csv' % (f, predictor))
            # skip predictors whose results already exist unless overwriting
            if os.path.exists(outfile) and overwrite == False:
                continue
            if predictor in base.mhc1_predictors:
                alleles = self.mhc1_alleles
                length = self.mhc1_length
            else:
                alleles = self.mhc2_alleles
                length = self.mhc2_length
            seqs = get_mutant_sequences(effects=effects, length=length,
                                        verbose=self.verbose)
            res = predict_variants(seqs, alleles=alleles, length=length,
                                   predictor=predictor, verbose=self.verbose,
                                   cpus=self.cpus)
            res['label'] = f
            res.to_csv(outfile, index=False)
            #gets promiscuous binders based on the cutoff
            P = base.get_predictor(predictor)
            P.data = res
            #pb = P.promiscuous_binders(n=1, keep_columns=True, cutoff=cutoffs[i])
            #pb['label'] = f
            #print (pb[:20])
            #pb.to_csv(os.path.join(path, 'binders_%s_%s.csv' %(f,predictor)), index=False)
            i += 1
        #peps = self_similarity(res, proteome="human_proteome")
    #combine results for multiple samples
    pd.DataFrame(labels).T.to_csv(os.path.join(path, 'sample_labels.csv'))
    print('finished, results saved to %s' % path)
    return
def predict_variants(df, predictor='tepitope', alleles=[], verbose=False, cpus=1,
                     cutoff=.95, cutoff_method='default'):
    """
    Predict binding scores for mutated and wt peptides (if present) from
    supplied variants.

    Args:
        df: pandas dataframe with peptide sequences, requires at least 2 columns
            'peptide' - the mutant peptide
            'wt' - a corresponding wild type peptide
            this data could be generated from get_mutant_sequences or from an
            external source
        predictor: mhc binding prediction method
        alleles: list of alleles
    Returns:
        dataframe with mutant and wt binding scores for all alleles plus other
        score metrics
    """
    # find matches to self proteome, adds penalty score column to df
    # we should only blast non-duplicates....
    if verbose == True:
        print('finding matches to self proteome')
    df = self_matches(df, cpus=cpus)
    if verbose == True:
        print('finding matches to viral proteomes')
    df = virus_matches(df, cpus=cpus)
    # get similarity scores for wt and closest match to proteome
    df['wt_similarity'] = df.apply(wt_similarity, axis=1)
    df['self_similarity'] = df.apply(self_similarity, axis=1)
    df['virus_similarity'] = df.apply(virus_similarity, axis=1)
    # get closest peptide in another column, either wt or nearest self
    df['closest'] = df.apply(get_closest_match, axis=1)
    P = base.get_predictor(predictor)
    if verbose == True:
        print(P)
        print('predicting mhc binding for %s peptides with %s' % (len(df), P.name))
    peps = list(df.peptide)
    res = P.predict_peptides(peps, alleles=alleles, cpus=cpus,
                             cutoff=cutoff, cutoff_method=cutoff_method,
                             drop_columns=True)
    pb = P.promiscuous_binders(n=1, cutoff=.95)
    if res is None:
        print('no binding predictions!')
        return
    # predict closest matching peptide affinity
    if verbose == True:
        print('predicting wt peptides')
    wtpeps = list(df.closest)
    b_wt = P.predict_peptides(wtpeps, alleles=alleles, cpus=cpus,
                              cutoff=cutoff, cutoff_method=cutoff_method,
                              drop_columns=True)
    # combine mutant and matching binding predictions
    res = combine_wt_scores(res, b_wt, P.scorekey)
    # positional axis argument to drop() was removed in pandas 2.0,
    # use the explicit columns= keyword instead
    res = res.drop(columns=['pos', 'name'])
    # combine binding results with main dataframe
    res = df.merge(res, on='peptide')
    res['binding_diff'] = res[P.scorekey] / res.matched_score
    # hydrophobicity and net charge
    res = analysis.peptide_properties(res, 'peptide')
    # exclude exact matches to self
    print('%s peptides with exact matches to self' % len(res[res.self_mismatches == 0]))
    # merge promiscuity measure into results
    if len(pb) > 0:
        res = res.merge(pb[['peptide', 'alleles']], on='peptide', how='left')
    else:
        print('no promiscuous binders')
        res['alleles'] = 0
    # rename some columns
    res = res.rename(columns={
        'rank': 'binding_rank',
        'alleles': 'promiscuity'
    })
    return res