def test_fdr_iter(self):
    """Check that ``aux.fdr`` accepts one-shot iterators.

    The PSMs are always passed as a fresh iterator. The decoy flags and
    PEP values are supplied first as callables and then as iterators over
    precomputed per-PSM values; the expected FDR is the same either way.
    """
    # Callables applied to PSMs drawn from an iterator.
    fdr_by_decoy = aux.fdr(iter(psms), is_decoy=self.is_decoy)
    self.assertAlmostEqual(fdr_by_decoy, 1.0)
    fdr_by_pep = aux.fdr(iter(psms), pep=self.pep)
    self.assertAlmostEqual(fdr_by_pep, 0.0355)

    # Precompute the per-PSM values so they can be passed as iterators too.
    decoy_flags = []
    for first, second, third in psms:
        decoy_flags.append(self.is_decoy((first, second, third)))
    pep_values = []
    for first, second, third in psms:
        pep_values.append(self.pep((first, second, third)))

    fdr_by_decoy = aux.fdr(iter(psms), is_decoy=iter(decoy_flags))
    self.assertAlmostEqual(fdr_by_decoy, 1.0)
    fdr_by_pep = aux.fdr(iter(psms), pep=iter(pep_values))
    self.assertAlmostEqual(fdr_by_pep, 0.0355)
def filter_evalue_new(self, FDR=1, FDR2=1, useMP=True, drop_decoy=True, toprint=True):
    """Filter PSMs by e-value and, optionally, by MP score at given FDR levels.

    PSMs are handled separately for every input file returned by
    ``self.get_infiles()``. For each file the PSMs are first filtered by
    e-value at ``FDR`` percent. When ``useMP`` is true, the PSMs whose
    e-value is above the resulting cutoff are filtered again by MP score
    (``peptscore``) at ``FDR2`` percent. Every PSM that passes either step
    gets its q-value stored in the ``qval`` attribute and is added to a new
    container created with ``self.copy_empty()``.

    Parameters
    ----------
    FDR : float
        FDR threshold for the e-value step, in percent.
    FDR2 : float
        FDR threshold for the MP-score step, in percent.
    useMP : bool
        Whether to run the second (MP-score) filtering step.
    drop_decoy : bool
        If true, ``filter_decoy()`` is called on the result before returning.
    toprint : bool
        If true, the achieved FDR and the cutoff of each step are logged.

    Returns
    -------
    tuple
        ``(new_peptides, best_cut_evalue, best_cut_peptscore)``. The two
        cutoffs are those of the last processed file; they are ``0`` and
        ``1.1`` when there are no input files.
    """
    def isdecoy(x):
        return x[0].note == 'decoy'

    def escore(x):
        return float(x[0].evalue)

    def mscore(x):
        # Negated so that a better MP score sorts like a lower e-value.
        return -float(x[0].peptscore)

    # Keyword arguments shared by all aux.filter / aux.qvalues calls.
    shared = dict(is_decoy=isdecoy, remove_decoy=False, formula=1, full_output=True)

    new_peptides = self.copy_empty()
    # Defaults for the returned cutoffs; without them an empty list of input
    # files would make the return statement fail with UnboundLocalError.
    best_cut_evalue = 0
    best_cut_peptscore = 1.1

    for infile in self.get_infiles():
        infile_peptides = [val for val in self.get_izip_full()
                           if val[0].infile == infile]

        filtered_peptides = aux.filter(
            infile_peptides, fdr=float(FDR) / 100, key=escore, **shared)
        qvals_e = aux.qvalues(
            filtered_peptides, key=escore, reverse=False, **shared)
        try:
            best_cut_evalue = max(escore(p) for p in filtered_peptides)
            real_FDR = round(aux.fdr(filtered_peptides, is_decoy=isdecoy) * 100, 1)
        except Exception:
            # Best-effort fallback (e.g. nothing passed the filter). Only
            # ordinary errors are swallowed, not KeyboardInterrupt/SystemExit.
            best_cut_evalue = 0
            real_FDR = 0
        if toprint:
            logger.info('%s %s e-value', real_FDR, best_cut_evalue)

        qvalue_tables = [qvals_e]
        best_cut_peptscore = 1.1
        if useMP:
            # Second chance for the PSMs rejected by the e-value cutoff.
            tmp_peptides = [p for p in infile_peptides
                            if escore(p) > best_cut_evalue]
            filtered_peptides = aux.filter(
                tmp_peptides, fdr=float(FDR2) / 100, key=mscore, **shared)
            qvals_m = aux.qvalues(
                filtered_peptides, key=mscore, reverse=False, **shared)
            try:
                best_cut_peptscore = min(
                    float(p[0].peptscore) for p in filtered_peptides)
                real_FDR = round(aux.fdr(filtered_peptides, is_decoy=isdecoy) * 100, 1)
            except Exception:
                best_cut_peptscore = 1.1
                real_FDR = 0
            if toprint:
                logger.info('%s %s MP score', real_FDR, best_cut_peptscore)
            qvalue_tables.append(qvals_m)

        # In each record the last item is the PSM and the one before it is
        # its q-value.
        for qvals in qvalue_tables:
            for val in qvals:
                val[-1][0].qval = val[-2]
                new_peptides.add_elem(val[-1])

    new_peptides.check_arrays()
    if drop_decoy:
        new_peptides.filter_decoy()
    return (new_peptides, best_cut_evalue, best_cut_peptscore)
def filter_evalue_prots(prots, FDR=1.0, remove_decoy=True, dec_prefix='DECOY_'):
    """Filter proteins by their ``'expect'`` value at the given FDR.

    Parameters
    ----------
    prots : dict
        Maps a protein name to a dict that holds at least an ``'expect'`` key.
    FDR : float
        FDR threshold, in percent.
    remove_decoy : bool
        If true, names starting with `dec_prefix` are left out of the result.
    dec_prefix : str
        Name prefix that marks a decoy protein.

    Returns
    -------
    dict
        The proteins that passed the filter. Each kept value dict (and each
        decoy one, even when dropped) gets a ``'qval'`` key set in place.
    """
    def is_decoy_entry(item):
        return item[0].startswith(dec_prefix)

    def expect_of(item):
        return float(item[1]['expect'])

    passed = aux.filter(
        prots.items(),
        fdr=float(FDR) / 100,
        key=expect_of,
        is_decoy=is_decoy_entry,
        remove_decoy=False,
        formula=1,
        full_output=True,
    )
    scored = aux.qvalues(
        passed,
        key=expect_of,
        is_decoy=is_decoy_entry,
        reverse=False,
        remove_decoy=False,
        formula=1,
        full_output=True,
    )

    kept = {}
    for row in scored:
        # The last item of a record is the (name, info) pair, the one before
        # it is the q-value.
        entry = row[-1]
        name = entry[0]
        info = entry[1]
        info['qval'] = row[-2]
        if remove_decoy and name.startswith(dec_prefix):
            continue
        kept[name] = info

    actual_fdr = aux.fdr(passed, is_decoy=is_decoy_entry) * 100
    logger.info('Actual protein-level FDR = %.2f%%', actual_fdr)
    return kept
def process_files(args):
    """Run Scavager for multiple files (`args['file']` should be a list of file names)
    and possibly for their union.

    Parameters
    ----------
    args : dict
        A dictionary of parameters as produced from argparse in :py:func:`search.run`.

    Returns
    -------
    out : int or list of int
        A single exit code is returned when argument validation fails
        (-101, -102), when a file fails with a code between -10 and 0
        (processing stops at once), or when exactly one file was processed
        individually. Otherwise a list is returned: one code per processed
        file, followed by the union code if the union was attempted
        (0 for success, 1 for an empty union result).
    """
    files = args['file']
    N = len(files)
    logger.info('%d file(s) to process.', N)
    cargs = args.copy()
    run_union = args['union'] or args['quick_union']

    if run_union and not args['database']:
        logger.error('--database is required for union calculation.')
        return -101
    if args['create_pepxml'] and pepxmltk is None:
        logger.error(
            'pepxmltk is required for --create-pepxml. Please install it.')
        return -102

    if args['database']:
        decoy_prots_2 = utils.split_fasta_decoys(
            args['database'], args['prefix'], args['infix'])
    else:
        decoy_prots_2 = None
        logger.info(
            'Database file not provided. Decoy randomization will be done per PSM file.')

    errors = 0
    retvalues = []
    if not args['quick_union']:
        for f in files:
            cargs['file'] = f
            retv = process_file(cargs, decoy2=decoy_prots_2)
            retvalues.append(retv)
            if -10 < retv < 0:
                logger.info('Stopping due to previous errors.')
                return retv
            if retv < 0:
                errors += 1
    else:
        logger.info('Skipping individual file processing.')

    # `retv` only exists if the per-file loop ran. With --quick-union and a
    # single file it is unbound, so fall through to the guard below.
    if N == 1 and not args['quick_union']:
        return retv

    # A union needs at least two usable files.
    if errors >= N - 1:
        if N > 1:
            logger.info(
                'Union will not be run because %s out of %s files were processed with errors.',
                errors, N)
        return retvalues

    if not run_union:
        return retvalues

    logger.info('Starting the union calculation...')
    # Columns stored in the CSV as Python literals that must be parsed back.
    literal_columns = (
        'protein', 'peptide_next_aa', 'peptide_prev_aa', 'num_tol_term',
        'protein_descr', 'modifications', 'mods_counter',
    )
    psm_full_dfs = []
    for fname in files:
        outfolder = utils.get_output_folder(args['output'], fname)
        outbasename = utils.get_output_basename(fname, args['name_suffix'])
        csvname = utils.filename(outfolder, outbasename, 'psm_full')
        try:
            df = pd.read_csv(csvname, sep='\t')
            for key in literal_columns:
                df[key] = df[key].apply(ast.literal_eval)
            psm_full_dfs.append(df)
        except FileNotFoundError:
            logger.warning('File %s not found, skipping...', csvname)

    if not psm_full_dfs:
        # pd.concat raises ValueError on an empty list; report an empty
        # union result instead of crashing.
        logger.warning('No PSM tables found for union.')
        retvalues.append(1)
        return retvalues

    all_psms = pd.concat(psm_full_dfs, sort=False)
    all_psms.reset_index(inplace=True, drop=True)
    logger.debug(
        'Recovered PSMs for analysis: %s, of those: %s decoy1, %s decoy2, %s have q < %s',
        all_psms.shape, all_psms.decoy1.sum(), all_psms.decoy2.sum(),
        (all_psms['q'] < args['fdr'] / 100).sum(), args['fdr'] / 100)
    utils.prepare_mods(all_psms)

    q_label = 'q'
    if not (args['no_correction'] or args['force_correction']):
        logger.info('Using the corrected q-values for union.')
        all_psms_f2 = all_psms[(~all_psms['decoy1']) &
                               (all_psms[q_label] < args['fdr'] / 100)]
        if not all_psms_f2.shape[0]:
            # Nothing passes with corrected q-values; use uncorrected ones.
            q_label = 'q_uncorrected'
            logger.info('No union results with correction. Disabling...')
    if args['no_correction']:
        q_label = 'q_uncorrected'
    logger.debug('Filtering union PSMs by %s.', q_label)
    all_psms_f2 = all_psms.loc[(~all_psms['decoy1']) & (
        all_psms[q_label] < args['fdr'] / 100)].copy()

    peptides, peptides_f, proteins, proteins_f, protein_groups = build_output_tables(
        all_psms, all_psms_f2, decoy_prots_2, args, 'PEP', calc_qvals=False)
    if peptides is None:
        logger.warning('No peptides identified in union.')
        retvalues.append(1)
        return retvalues

    logger.debug('Protein FDR in full table: %f%%',
                 100 * aux.fdr(proteins, is_decoy='decoy2'))
    # The union tables go to the output folder of the last input file.
    write_tables(outfolder, 'union' + args['name_suffix'], all_psms,
                 all_psms_f2, peptides_f, proteins_f, protein_groups)
    if args['create_pepxml']:
        pepxmltk.easy_write_pepxml(
            files, utils.filename(outfolder, 'union', 'pepxml'),
            set(all_psms_f2.loc[~all_psms_f2['decoy2'], 'spectrum']))

    target_psms_f2 = all_psms_f2[~all_psms_f2['decoy2']]
    if len(target_psms_f2) >= 3:
        plot_outfigures(all_psms, target_psms_f2,
                        peptides, peptides_f[~peptides_f['decoy2']],
                        outfolder, 'union' + args['name_suffix'],
                        df_proteins=proteins,
                        df_proteins_f=proteins_f[~proteins_f['decoy2']],
                        separate_figures=args['separate_figures'])
    logger.info('Union calculation complete.')
    retvalues.append(0)
    return retvalues
def _run_check(self, psms, **kwargs):
    """Assert the expected FDR values for `psms`.

    ``is_decoy`` and ``pep`` may be given as keyword arguments; when they
    are absent, ``self.is_decoy`` and ``self.pep`` are used. The
    decoy-based FDR is checked for formulas 1 and 2, then the PEP-based one.
    """
    decoy_source = kwargs.pop('is_decoy', self.is_decoy)
    pep_source = kwargs.pop('pep', self.pep)

    for formula_id in (1, 2):
        decoy_fdr = aux.fdr(psms, is_decoy=decoy_source, formula=formula_id)
        self.assertAlmostEqual(decoy_fdr, 1.0)

    pep_fdr = aux.fdr(psms, pep=pep_source)
    self.assertAlmostEqual(pep_fdr, 0.0355)