def test_filter_iter(self):
    """Check aux.filter on a one-shot iterator of PSMs with a decoy predicate.

    The same filtering is run twice on fresh iterators over ``self.psms``:
    once with the default output (an object exposing ``shape``) and once
    with ``full_output=False``, where the result is used as a context
    manager and iterated. Both are expected to yield 26 PSMs at fdr=0.5.
    """
    options = dict(key=self.key, is_decoy=self.is_decoy, fdr=0.5)

    filtered = aux.filter(iter(self.psms), **options)
    self.assertEqual(filtered.shape[0], 26)

    # A new iterator is required: the first call has consumed the previous one.
    with aux.filter(iter(self.psms), full_output=False, **options) as stream:
        kept = list(stream)
    self.assertEqual(len(kept), 26)
def test_filter_pep_iter(self):
    """Check aux.filter on a one-shot iterator of PSMs using ``pep``.

    Runs the filtering with the default output and again with
    ``full_output=False`` (context-manager form); both are expected to
    keep 21 PSMs at fdr=0.02.
    """
    options = dict(key=self.key, pep=self.pep, fdr=0.02)

    filtered = aux.filter(iter(self.psms), **options)
    self.assertEqual(filtered.shape[0], 21)

    # A new iterator is required: the first call has consumed the previous one.
    with aux.filter(iter(self.psms), full_output=False, **options) as stream:
        kept = list(stream)
    self.assertEqual(len(kept), 21)
def filter_evalue_new(self, FDR=1, FDR2=1, useMP=True, drop_decoy=True, toprint=True):
    """Filter PSMs by e-value and, optionally, by MP score at the given FDR levels.

    PSMs are processed separately for every input file reported by
    ``self.get_infiles()``. For each file the PSMs are first filtered by
    e-value at ``FDR`` percent. When ``useMP`` is true, the PSMs whose e-value
    is above the e-value cut-off are additionally filtered by peptide score at
    ``FDR2`` percent. Every accepted PSM gets its q-value stored in ``qval``
    and is added to a new container obtained from ``self.copy_empty()``.

    Parameters
    ----------
    FDR : number
        FDR threshold for the e-value filtering, in percent.
    FDR2 : number
        FDR threshold for the MP-score filtering, in percent.
    useMP : bool
        Whether to run the second, MP-score based filtering step.
    drop_decoy : bool
        Whether to call ``filter_decoy()`` on the result before returning.
    toprint : bool
        Whether to log the achieved FDR and cut-off of each step.

    Returns
    -------
    tuple
        ``(new_peptides, best_cut_evalue, best_cut_peptscore)``. The two
        cut-offs are those of the last processed input file; when there are
        no input files they are ``0`` and ``1.1`` respectively.
    """
    def isdecoy(x):
        return x[0].note == 'decoy'

    def escore(x):
        return float(x[0].evalue)

    def mscore(x):
        # Negated so that a better (higher) peptide score sorts first.
        return -float(x[0].peptscore)

    new_peptides = self.copy_empty()

    # Fallback cut-offs; without them the return statement below would fail
    # with UnboundLocalError when there are no input files to iterate over.
    best_cut_evalue = 0
    best_cut_peptscore = 1.1

    for infile in self.get_infiles():
        infile_peptides = [val for val in self.get_izip_full() if val[0].infile == infile]

        filtered_peptides = aux.filter(
            infile_peptides, fdr=float(FDR) / 100, key=escore, is_decoy=isdecoy,
            remove_decoy=False, formula=1, full_output=True)
        qvals_e = aux.qvalues(
            filtered_peptides, key=escore, is_decoy=isdecoy, reverse=False,
            remove_decoy=False, formula=1, full_output=True)
        try:
            best_cut_evalue = max(escore(p) for p in filtered_peptides)
            real_FDR = round(aux.fdr(filtered_peptides, is_decoy=isdecoy) * 100, 1)
        except Exception:
            # Best effort: e.g. nothing passed the filter, so there is no cut-off.
            best_cut_evalue = 0
            real_FDR = 0
        if toprint:
            logger.info('%s %s e-value', real_FDR, best_cut_evalue)

        best_cut_peptscore = 1.1
        qvals_m = ()
        if useMP:
            # Only PSMs that did not pass the e-value cut-off go to the second step.
            tmp_peptides = [p for p in infile_peptides if escore(p) > best_cut_evalue]
            filtered_peptides = aux.filter(
                tmp_peptides, fdr=float(FDR2) / 100, key=mscore, is_decoy=isdecoy,
                remove_decoy=False, formula=1, full_output=True)
            qvals_m = aux.qvalues(
                filtered_peptides, key=mscore, is_decoy=isdecoy, reverse=False,
                remove_decoy=False, formula=1, full_output=True)
            try:
                best_cut_peptscore = min(float(p[0].peptscore) for p in filtered_peptides)
                real_FDR = round(aux.fdr(filtered_peptides, is_decoy=isdecoy) * 100, 1)
            except Exception:
                best_cut_peptscore = 1.1
                real_FDR = 0
            if toprint:
                logger.info('%s %s MP score', real_FDR, best_cut_peptscore)

        # In each q-value record the last item is the PSM and the one before it
        # is the q-value.
        for val in qvals_e:
            val[-1][0].qval = val[-2]
            new_peptides.add_elem(val[-1])
        if useMP:
            for val in qvals_m:
                val[-1][0].qval = val[-2]
                new_peptides.add_elem(val[-1])

    new_peptides.check_arrays()
    if drop_decoy:
        new_peptides.filter_decoy()
    return (new_peptides, best_cut_evalue, best_cut_peptscore)
def test_filter_pep_array_iter_key_str_is_decoy(self):
    """Check aux.filter on a structured array with an iterator key and a string ``pep``.

    The PSMs are packed into a NumPy structured array that also carries an
    ``'is decoy'`` column. The key is supplied as an iterator of precomputed
    scores and ``pep`` as a field name. Both the default output and the
    ``full_output=False`` context-manager form are expected to keep 21 PSMs
    at fdr=0.02.

    NOTE(review): the test name mentions a string ``is_decoy``, but no
    ``is_decoy`` argument is passed to ``aux.filter`` here -- confirm intent.
    """
    # np.bool_ is used instead of the np.bool alias, which does not exist in
    # NumPy 1.24-1.26 and would raise AttributeError there.
    dtype = [('score', np.int8), ('label', np.str_, 1), ('pep', np.float64), ('is decoy', np.bool_)]
    psms = np.array(
        [(score, label, pep, self.is_decoy((score, label, pep))) for score, label, pep in self.psms],
        dtype=dtype)

    key = iter([self.key(psm) for psm in psms])
    f = aux.filter(psms, key=key, pep='pep', fdr=0.02)
    self.assertEqual(f.shape[0], 21)

    # The key iterator is single-use, so build a fresh one for the second call.
    key = iter(self.key(psm) for psm in psms)
    with aux.filter(psms, key=key, pep='pep', fdr=0.02, full_output=False) as f:
        f1 = list(f)
    self.assertEqual(len(f1), 21)
def test_filter_pep_array_gen_key(self):
    """Check aux.filter on a structured array with a generator as the key.

    ``self.psms`` is converted to a NumPy structured array and the key is a
    generator expression over its records. Both the default output and the
    ``full_output=False`` context-manager form are expected to keep 21 PSMs
    at fdr=0.02.
    """
    fields = [('score', np.int8), ('label', np.str_, 1), ('pep', np.float64)]
    records = np.array(self.psms, dtype=fields)

    scores = (self.key(rec) for rec in records)
    filtered = aux.filter(records, key=scores, pep=self.pep, fdr=0.02)
    self.assertEqual(filtered.shape[0], 21)

    # Generators are single-use, so a new one is created for the second run.
    scores = (self.key(rec) for rec in records)
    with aux.filter(records, key=scores, pep=self.pep, fdr=0.02, full_output=False) as stream:
        kept = list(stream)
    self.assertEqual(len(kept), 21)
def test_filter_pep_two_iters(self):
    """Check aux.filter with the PSMs split across two iterators, using ``pep``.

    ``self.psms`` is cut at a random position and the two parts are passed
    as separate iterators. Both the default output and the
    ``full_output=False`` context-manager form are expected to keep 21 PSMs
    at fdr=0.02.
    """
    split = np.random.randint(1, len(self.psms) - 1)
    options = dict(key=self.key, pep=self.pep, fdr=0.02)

    filtered = aux.filter(iter(self.psms[:split]), iter(self.psms[split:]), **options)
    self.assertEqual(filtered.shape[0], 21)

    # Fresh iterators over the same two parts for the second run.
    first = iter(self.psms[:split])
    second = iter(self.psms[split:])
    with aux.filter(first, second, full_output=False, **options) as stream:
        kept = list(stream)
    self.assertEqual(len(kept), 21)
def test_filter_two_iters(self):
    """Check aux.filter with the PSMs split across two iterators and a decoy predicate.

    ``self.psms`` is cut at a random position and the two parts are passed
    as separate iterators. Both the default output and the
    ``full_output=False`` context-manager form are expected to keep 26 PSMs
    at fdr=0.5.
    """
    split = np.random.randint(1, len(self.psms) - 1)
    options = dict(key=self.key, is_decoy=self.is_decoy, fdr=0.5)

    filtered = aux.filter(iter(self.psms[:split]), iter(self.psms[split:]), **options)
    self.assertEqual(filtered.shape[0], 26)

    # Fresh iterators over the same two parts for the second run.
    first = iter(self.psms[:split])
    second = iter(self.psms[split:])
    with aux.filter(first, second, full_output=False, **options) as stream:
        kept = list(stream)
    self.assertEqual(len(kept), 26)
def test_filter_pep_two_dataframes_str_key_iter_pep(self):
    """Check aux.filter on two DataFrames with a string key and an iterator ``pep``.

    The PSMs are packed into a structured array, which is split at a random
    position into two pandas DataFrames. The key is a column name and
    ``pep`` is an iterator over the ``'pep'`` column of the whole array.
    Both the default output and the ``full_output=False`` context-manager
    form are expected to keep 21 PSMs at fdr=0.02.
    """
    i = np.random.randint(1, len(self.psms) - 1)
    # np.bool_ is used instead of the np.bool alias, which does not exist in
    # NumPy 1.24-1.26 and would raise AttributeError there.
    dtype = [('score', np.int8), ('label', np.str_, 1), ('pep', np.float64), ('is decoy', np.bool_)]
    psms = np.array(
        [(score, label, pep, self.is_decoy((score, label, pep))) for score, label, pep in self.psms],
        dtype=dtype)

    pep = iter(psms['pep'])
    psms1 = pd.DataFrame(psms[:i])
    psms2 = pd.DataFrame(psms[i:])
    f = aux.filter(psms1, psms2, key='score', pep=pep, fdr=0.02)
    self.assertEqual(f.shape[0], 21)

    # The pep iterator is single-use, so build a fresh one for the second call.
    pep = iter(psms['pep'])
    with aux.filter(psms1, psms2, key='score', pep=pep, fdr=0.02, full_output=False) as f:
        f1 = list(f)
    self.assertEqual(len(f1), 21)
def _run_check(self, *args, **kwargs):
    """Run aux.filter on ``args`` with four option sets and verify the counts.

    ``key`` and ``is_decoy`` are taken from ``kwargs`` when given, otherwise
    from ``self.key`` and ``self.is_decoy``. Each option set is checked at
    fdr=0.5, first with the default output (via ``shape[0]``) and then with
    ``full_output=False`` (context-manager form, via ``len``).
    """
    key = kwargs.get('key', self.key)
    is_decoy = kwargs.get('is_decoy', self.is_decoy)

    # (extra keyword arguments, expected number of PSMs passing the filter)
    cases = (
        ({}, 26),
        ({'formula': 2}, 26),
        ({'remove_decoy': False, 'formula': 1}, 39),
        ({'remove_decoy': False}, 34),
    )

    # All filtering calls are made before any assertion is evaluated.
    arrays = [
        aux.filter(*args, key=key, is_decoy=is_decoy, fdr=0.5, **extra)
        for extra, _ in cases
    ]
    for (_, expected), result in zip(cases, arrays):
        self.assertEqual(result.shape[0], expected)

    collected = []
    for extra, _ in cases:
        with aux.filter(*args, key=key, is_decoy=is_decoy, fdr=0.5,
                        full_output=False, **extra) as stream:
            collected.append(list(stream))
    for (_, expected), items in zip(cases, collected):
        self.assertEqual(len(items), expected)
def _run_check_pep(self, *args, **kwargs):
    """Run aux.filter at fdr=0.02 with and without an explicit key and verify the counts.

    ``key`` is removed from ``kwargs`` (defaulting to ``self.key``); the
    remaining keyword arguments are forwarded unchanged. Filtering is done
    once with the key and once without it, for both the default output and
    the ``full_output=False`` context-manager form; 21 PSMs are expected
    every time.
    """
    key = kwargs.pop('key', self.key)

    with_key = aux.filter(*args, key=key, fdr=0.02, **kwargs)
    without_key = aux.filter(*args, fdr=0.02, **kwargs)
    for result in (with_key, without_key):
        self.assertEqual(result.shape[0], 21)

    with aux.filter(*args, key=key, fdr=0.02, full_output=False, **kwargs) as stream:
        with_key = list(stream)
    with aux.filter(*args, fdr=0.02, full_output=False, **kwargs) as stream:
        without_key = list(stream)
    for items in (with_key, without_key):
        self.assertEqual(len(items), 21)
def get_subset(results, settings, fdr=0.01):
    """Filter results to the given FDR using only the top candidate of each result.

    The score of a result is the first entry of its ``'e-values'`` and a
    result counts as a decoy when the third field of its first candidate
    equals ``'d'``. ``settings`` is accepted but not used here.
    """
    def top_evalue(result):
        return result['e-values'][0]

    def top_is_decoy(result):
        return result['candidates'][0][2] == 'd'

    return aux.filter(results, key=top_evalue, is_decoy=top_is_decoy, fdr=fdr)
def test_filter_empty_dataframe_str_key_str_is_decoy(self):
    """Check that aux.filter and aux.qvalues accept an empty DataFrame.

    The key and the decoy flag are both given as column names; each call
    is expected to return an object with zero rows.
    """
    empty = pd.DataFrame({'score': [], 'is decoy': []})

    filtered = aux.filter(empty, key='score', is_decoy='is decoy', fdr=0.1)
    self.assertEqual(filtered.shape[0], 0)

    scored = aux.qvalues(
        empty,
        key='score',
        is_decoy='is decoy',
        remove_decoy=False,
        formula=1,
        full_output=True,
        fdr=0.01,
    )
    self.assertEqual(scored.shape[0], 0)
def test_filter_empty_dataframe(self):
    """Check that aux.filter and aux.qvalues accept an empty, typed DataFrame.

    The DataFrame is built from an empty structured array; the key and decoy
    predicate are ``self.key`` and ``self.is_decoy``. Each call is expected
    to return an object with zero rows.
    """
    fields = [('score', np.int8), ('label', np.str_, 1), ('pep', np.float64)]
    empty = pd.DataFrame(np.array([], dtype=fields))
    shared = dict(key=self.key, is_decoy=self.is_decoy, remove_decoy=False, formula=1, fdr=0.1)

    filtered = aux.filter(empty, **shared)
    self.assertEqual(filtered.shape[0], 0)

    scored = aux.qvalues(empty, full_output=True, **shared)
    self.assertEqual(scored.shape[0], 0)
def show_info(args):
    """Log the total and FDR-filtered PSM counts of a pepXML file.

    Reads all PSMs from ``args.file``, filters them at ``args.fdr`` with
    decoys recognised via ``args.decoy_prefix``, and logs the file name, the
    total number of PSMs and the size of the filtered result using
    ``args.format`` as the logging format string.
    """
    with pepxml.PepXML(args.file) as reader:
        all_psms = list(reader)

    def decoy_check(psm):
        return pepxml.is_decoy(psm, args.decoy_prefix)

    passed = aux.filter(all_psms, key=pepxml._key, fdr=args.fdr, is_decoy=decoy_check)
    logger.info(args.format, args.file, len(all_psms), passed.size)
def filter_evalue_prots(prots, FDR=1.0, remove_decoy=True, dec_prefix='DECOY_'):
    """Filter proteins by their ``'expect'`` value at the given FDR (in percent).

    ``prots`` maps a protein name to a dict holding an ``'expect'`` entry; a
    protein is a decoy when its name starts with ``dec_prefix``. Every
    protein passing the filter gets its q-value stored under ``'qval'`` in
    its own dict. The achieved protein-level FDR is logged.

    Returns a new dict of the passing proteins, without decoys when
    ``remove_decoy`` is true.
    """
    def is_decoy_entry(item):
        return item[0].startswith(dec_prefix)

    def expect_of(item):
        return float(item[1]['expect'])

    shared = dict(key=expect_of, is_decoy=is_decoy_entry, remove_decoy=False,
                  formula=1, full_output=True)
    passing = aux.filter(prots.items(), fdr=float(FDR) / 100, **shared)
    scored = aux.qvalues(passing, reverse=False, **shared)

    kept = {}
    for row in scored:
        # The last item of a q-value record is the (name, info) entry and the
        # one before it is the q-value.
        entry = row[-1]
        entry[1]['qval'] = row[-2]
        if remove_decoy and entry[0].startswith(dec_prefix):
            continue
        kept[entry[0]] = entry[1]

    achieved = aux.fdr(passing, is_decoy=is_decoy_entry) * 100
    logger.info('Actual protein-level FDR = %.2f%%', achieved)
    return kept
key=lambda x: x['SpectrumIdentificationItem'][0]['MS-GF:EValue'], fdr=0.01) pylab.figure() pylab.hist( [psm['SpectrumIdentificationItem'][0]['chargeState'] for psm in msgf], bins=np.arange(5), align='left') pylab.xticks(np.linspace(0, 4, 5)) pylab.xlabel('charge state') morpheus = pd.read_table('example.PSMs.tsv') amanda = pd.read_table('example_output.csv', skiprows=1) morph_filt = aux.filter(morpheus, fdr=0.01, key='Morpheus Score', reverse=True, is_decoy='Decoy?') pylab.figure() morph_filt.plot(x='Retention Time (minutes)', y='Precursor Mass (Da)', kind='scatter') amanda['isDecoy'] = [ all(s.startswith('DECOY') for s in prot.split(';')) for prot in amanda['Protein Accessions'] ] amanda_filt = aux.filter(amanda[amanda['Rank'] == 1], key='Weighted Probability', is_decoy='isDecoy',