Exemple #1
0
 def test_fdr_iter(self):
     """Check that ``aux.fdr`` works when its inputs are one-shot iterators.

     The PSMs are always supplied as an iterator. ``is_decoy`` and ``pep``
     are given first as callables and then as iterators over values
     precomputed for every PSM.
     """
     expected_decoy_fdr = 1.0
     expected_pep_fdr = 0.0355

     # Callables evaluated on PSMs that come from an iterator.
     self.assertAlmostEqual(
         aux.fdr(iter(psms), is_decoy=self.is_decoy), expected_decoy_fdr)
     self.assertAlmostEqual(
         aux.fdr(iter(psms), pep=self.pep), expected_pep_fdr)

     # Precomputed per-PSM values; each PSM is rebuilt as a plain 3-tuple
     # before being handed to the callable.
     decoy_flags = [
         self.is_decoy((first, second, third)) for first, second, third in psms
     ]
     pep_values = [
         self.pep((first, second, third)) for first, second, third in psms
     ]
     self.assertAlmostEqual(
         aux.fdr(iter(psms), is_decoy=iter(decoy_flags)), expected_decoy_fdr)
     self.assertAlmostEqual(
         aux.fdr(iter(psms), pep=iter(pep_values)), expected_pep_fdr)
Exemple #2
0
    def filter_evalue_new(self, FDR=1, FDR2=1, useMP=True, drop_decoy=True, toprint=True):
        """Filter PSMs by e-value and, optionally, by MP score at a given FDR.

        PSMs are processed separately for every input file. For each file the
        PSMs are first filtered by e-value at ``FDR``; if ``useMP`` is set, the
        PSMs that did not pass the e-value cut-off are filtered again by MP
        score at ``FDR2``. Every retained PSM gets its ``qval`` attribute set
        and is added to a new, initially empty copy of this container.

        Parameters
        ----------
        FDR : float
            FDR threshold for e-value filtering, in percent.
        FDR2 : float
            FDR threshold for MP-score filtering, in percent.
        useMP : bool
            Whether to run the second, MP-score based filtering step.
        drop_decoy : bool
            Whether to call ``filter_decoy()`` on the result before returning.
        toprint : bool
            Whether to log the achieved FDR and cut-off of every step.

        Returns
        -------
        tuple
            ``(new_peptides, best_cut_evalue, best_cut_peptscore)``. The two
            cut-offs are those of the last processed input file, or the
            fallback values ``0`` and ``1.1`` if there were no input files.
        """
        def isdecoy(x):
            return x[0].note == 'decoy'

        def escore(x):
            return float(x[0].evalue)

        def mscore(x):
            # Negated score is used as the sorting key for aux.filter/qvalues.
            return -float(x[0].peptscore)

        # Defined up front so that the final ``return`` does not fail when
        # there are no input files; values match the per-file fallbacks below.
        best_cut_evalue = 0
        best_cut_peptscore = 1.1

        new_peptides = self.copy_empty()
        for infile in self.get_infiles():
            infile_peptides = [val for val in self.get_izip_full() if val[0].infile == infile]
            filtered_peptides = aux.filter(infile_peptides, fdr=float(FDR)/100, key=escore, is_decoy=isdecoy, remove_decoy=False, formula=1, full_output=True)
            qvals_e = aux.qvalues(filtered_peptides, key=escore, is_decoy=isdecoy, reverse=False, remove_decoy=False, formula=1, full_output=True)
            try:
                best_cut_evalue = max(escore(p) for p in filtered_peptides)
                real_FDR = round(aux.fdr(filtered_peptides, is_decoy=isdecoy) * 100, 1)
            except Exception:
                # Best effort: e.g. max() of an empty selection. Unlike a bare
                # ``except``, this lets KeyboardInterrupt/SystemExit through.
                best_cut_evalue = 0
                real_FDR = 0
            if toprint:
                logger.info('%s %s e-value', real_FDR, best_cut_evalue)

            best_cut_peptscore = 1.1
            qvals_m = []
            if useMP:
                # Second pass: only PSMs that failed the e-value cut-off.
                tmp_peptides = [p for p in infile_peptides if escore(p) > best_cut_evalue]
                filtered_peptides = aux.filter(tmp_peptides, fdr=float(FDR2)/100, key=mscore, is_decoy=isdecoy, remove_decoy=False, formula=1, full_output=True)
                qvals_m = aux.qvalues(filtered_peptides, key=mscore, is_decoy=isdecoy, reverse=False, remove_decoy=False, formula=1, full_output=True)
                try:
                    best_cut_peptscore = min(float(p[0].peptscore) for p in filtered_peptides)
                    real_FDR = round(aux.fdr(filtered_peptides, is_decoy=isdecoy) * 100, 1)
                except Exception:
                    best_cut_peptscore = 1.1
                    real_FDR = 0
                if toprint:
                    logger.info('%s %s MP score', real_FDR, best_cut_peptscore)

            # With full_output=True each record ends with (..., q-value, PSM);
            # e-value hits are added first, then the MP-score hits.
            for qvals in (qvals_e, qvals_m):
                for val in qvals:
                    val[-1][0].qval = val[-2]
                    new_peptides.add_elem(val[-1])

        new_peptides.check_arrays()
        if drop_decoy:
            new_peptides.filter_decoy()
        return (new_peptides, best_cut_evalue, best_cut_peptscore)
Exemple #3
0
def filter_evalue_prots(prots, FDR=1.0, remove_decoy=True, dec_prefix='DECOY_'):
    """Filter proteins by their ``'expect'`` value at the requested FDR.

    Parameters
    ----------
    prots : dict
        Mapping of protein name to a dict that contains an ``'expect'`` entry.
    FDR : float
        FDR threshold in percent.
    remove_decoy : bool
        If true, proteins whose name starts with ``dec_prefix`` are left out
        of the returned mapping.
    dec_prefix : str
        Name prefix that marks a protein as decoy.

    Returns
    -------
    dict
        The proteins that passed the filter; each value additionally gets a
        ``'qval'`` entry.
    """
    protein_items = prots.items()

    def _is_decoy(item):
        return item[0].startswith(dec_prefix)

    def _expect(item):
        return float(item[1]['expect'])

    passed = aux.filter(
        protein_items,
        fdr=float(FDR) / 100,
        key=_expect,
        is_decoy=_is_decoy,
        remove_decoy=False,
        formula=1,
        full_output=True,
    )
    scored = aux.qvalues(
        passed,
        key=_expect,
        is_decoy=_is_decoy,
        reverse=False,
        remove_decoy=False,
        formula=1,
        full_output=True,
    )

    kept = {}
    for row in scored:
        # The last two fields of a record are the q-value and the original item.
        qvalue = row[-2]
        item = row[-1]
        item[1]['qval'] = qvalue
        name = item[0]
        if remove_decoy and name.startswith(dec_prefix):
            continue
        kept[name] = item[1]

    actual_fdr = aux.fdr(passed, is_decoy=_is_decoy) * 100
    logger.info('Actual protein-level FDR = %.2f%%', actual_fdr)
    return kept
Exemple #4
0
def process_files(args):
    """Run Scavager for multiple files (`args['file']` should be a list of file names)
    and possibly for their union.

    Parameters
    ----------

    args : dict
        A dictionary of parameters as produced from argparse in :py:func:`search.run`.

    Returns
    -------
    out : int or list of int
        A single exit code is returned when a precondition fails (-101, -102),
        when a file fails with a code between -10 and 0 (processing stops), or
        when exactly one file was processed. Otherwise a list of per-file
        codes is returned, followed by the union code if the union was
        attempted (0 for success, 1 for an empty union result).
    """
    files = args['file']
    N = len(files)
    logger.info('%d file(s) to process.', N)
    cargs = args.copy()
    if args['union'] or args['quick_union']:
        if not args['database']:
            logger.error('--database is required for union calculation.')
            return -101
    if args['create_pepxml'] and pepxmltk is None:
        logger.error(
            'pepxmltk is required for --create-pepxml. Please install it.')
        return -102
    if args['database']:
        decoy_prots_2 = utils.split_fasta_decoys(args['database'],
                                                 args['prefix'], args['infix'])
    else:
        decoy_prots_2 = None
        logger.info(
            'Database file not provided. Decoy randomization will be done per PSM file.'
        )
    errors = 0
    retvalues = []
    if not args['quick_union']:
        for f in files:
            cargs['file'] = f
            retv = process_file(cargs, decoy2=decoy_prots_2)
            retvalues.append(retv)
            # Codes in (-10, 0) abort the whole run; other negative codes are
            # only counted as per-file errors.
            if -10 < retv < 0:
                logger.info('Stopping due to previous errors.')
                return retv
            if retv < 0:
                errors += 1
    else:
        logger.info('Skipping individual file processing.')
    if N == 1:
        if retvalues:
            return retvalues[0]
        # quick_union skipped per-file processing, so there is no per-file
        # code to return, and a union is never built from a single file.
        logger.warning('Union is not calculated for a single file.')
        return retvalues
    if errors >= N - 1:
        if N > 1:
            logger.info(
                'Union will not be run because %s out of %s files were processed with errors.',
                errors, N)
        return retvalues
    if args['union'] or args['quick_union']:
        logger.info('Starting the union calculation...')
        psm_full_dfs = []
        for fname in files:
            outfolder = utils.get_output_folder(args['output'], fname)
            outbasename = utils.get_output_basename(fname, args['name_suffix'])
            csvname = utils.filename(outfolder, outbasename, 'psm_full')
            try:
                df = pd.read_csv(csvname, sep='\t')
                # These columns are read back as text and parsed into Python
                # objects with ast.literal_eval.
                for key in [
                        'protein', 'peptide_next_aa', 'peptide_prev_aa',
                        'num_tol_term', 'protein_descr', 'modifications',
                        'mods_counter'
                ]:
                    df[key] = df[key].apply(ast.literal_eval)
                psm_full_dfs.append(df)
            except FileNotFoundError:
                logger.warning('File %s not found, skipping...', csvname)
        if not psm_full_dfs:
            # pd.concat raises ValueError on an empty list; report an empty
            # union result instead of crashing.
            logger.warning('No PSM tables found for union.')
            retvalues.append(1)
            return retvalues
        all_psms = pd.concat(psm_full_dfs, sort=False)
        all_psms.reset_index(inplace=True, drop=True)
        logger.debug(
            'Recovered PSMs for analysis: %s, of those: %s decoy1, %s decoy2, %s have q < %s',
            all_psms.shape, all_psms.decoy1.sum(), all_psms.decoy2.sum(),
            (all_psms['q'] < args['fdr'] / 100).sum(), args['fdr'] / 100)
        utils.prepare_mods(all_psms)
        q_label = 'q'
        if not (args['no_correction'] or args['force_correction']):
            logger.info('Using the corrected q-values for union.')
            all_psms_f2 = all_psms[(~all_psms['decoy1'])
                                   & (all_psms[q_label] < args['fdr'] / 100)]
            if not all_psms_f2.shape[0]:
                # Nothing passes with corrected q-values: fall back to the
                # uncorrected ones.
                q_label = 'q_uncorrected'
                logger.info('No union results with correction. Disabling...')
        if args['no_correction']:
            q_label = 'q_uncorrected'
        logger.debug('Filtering union PSMs by %s.', q_label)
        all_psms_f2 = all_psms.loc[(~all_psms['decoy1']) & (
            all_psms[q_label] < args['fdr'] / 100)].copy()

        peptides, peptides_f, proteins, proteins_f, protein_groups = build_output_tables(
            all_psms,
            all_psms_f2,
            decoy_prots_2,
            args,
            'PEP',
            calc_qvals=False)
        if peptides is None:
            logger.warning('No peptides identified in union.')
            retvalues.append(1)
            return retvalues

        logger.debug('Protein FDR in full table: %f%%',
                     100 * aux.fdr(proteins, is_decoy='decoy2'))

        # Union output goes to the output folder of the last input file.
        write_tables(outfolder, 'union' + args['name_suffix'], all_psms,
                     all_psms_f2, peptides_f, proteins_f, protein_groups)
        if args['create_pepxml']:
            # NOTE(review): unlike the tables above, the pepXML name does not
            # include args['name_suffix'] - confirm this is intended.
            pepxmltk.easy_write_pepxml(
                files, utils.filename(outfolder, 'union', 'pepxml'),
                set(all_psms_f2.loc[~all_psms_f2['decoy2'], 'spectrum']))

        if len(all_psms_f2[~all_psms_f2['decoy2']]) >= 3:
            plot_outfigures(all_psms,
                            all_psms_f2[~all_psms_f2['decoy2']],
                            peptides,
                            peptides_f[~peptides_f['decoy2']],
                            outfolder,
                            'union' + args['name_suffix'],
                            df_proteins=proteins,
                            df_proteins_f=proteins_f[~proteins_f['decoy2']],
                            separate_figures=args['separate_figures'])

        logger.info('Union calculation complete.')
        retvalues.append(0)
    return retvalues
Exemple #5
0
 def _run_check(self, psms, **kwargs):
     """Assert the expected ``aux.fdr`` results for *psms*.

     ``is_decoy`` and ``pep`` can be overridden through keyword arguments;
     when absent, ``self.is_decoy`` and ``self.pep`` are used.
     """
     decoy_arg = kwargs.pop('is_decoy', self.is_decoy)
     pep_arg = kwargs.pop('pep', self.pep)

     # Both decoy-based formulas are expected to give the same estimate.
     for formula in (1, 2):
         self.assertAlmostEqual(
             aux.fdr(psms, is_decoy=decoy_arg, formula=formula), 1.0)

     # PEP-based estimate.
     self.assertAlmostEqual(aux.fdr(psms, pep=pep_arg), 0.0355)