Beispiel #1
0
 def get_RC(self):
     """Fit retention coefficients for the achrom model and store them in ``self.RC``.

     The coefficients are fitted with ``achrom.get_RCs_vary_lcp`` on the
     modified sequences of ``self.peptideslist`` against ``self.RT_exp``.
     Labels for which the fit produced no value are estimated from the
     default ``achrom.RCs_gilar_rp`` table through a linear regression
     between default and fitted coefficients.

     If anything goes wrong, the error is logged and the default table
     (extended with 'U' and 'O') is used instead.
     """
     # Work on a private copy so that the shared module-level table in
     # ``achrom`` is not modified as a side effect of calling this method.
     RC_def = dict(achrom.RCs_gilar_rp)
     RC_def['aa'] = dict(RC_def['aa'])
     # 'U' and 'O' fall back to the coefficients of 'C' and 'K' respectively.
     RC_def['aa'].setdefault('U', RC_def['aa'].get('C', 0.0))
     RC_def['aa'].setdefault('O', RC_def['aa'].get('K', 0.0))
     try:
         seqs = [pept.modified_sequence for pept in self.peptideslist]
         RTexp = self.RT_exp
         aa_labels = set(RC_def['aa'])
         for pept in self.peptideslist:
             # .values() works on both Python 2 and 3 (itervalues() does not).
             aa_labels.update(pept.modification_list.values())

         # label -> [default coefficient, fitted coefficient]
         xdict = {}
         for key, val in RC_def['aa'].items():
             xdict[key] = [val, None]
         RC_dict = achrom.get_RCs_vary_lcp(seqs, RTexp, labels=aa_labels)
         for key, val in RC_dict['aa'].items():
             if key in xdict:
                 xdict[key][1] = val
             else:
                 xdict[key] = [None, val]

         # Regress fitted on default coefficients using only the labels
         # for which both values are known.
         known = [(default, fitted) for default, fitted in xdict.values()
                  if default is not None and fitted is not None]
         a, b, _, _ = aux.linear_regression([pair[0] for pair in known],
                                            [pair[1] for pair in known])
         for key, x in xdict.items():
             if x[1] is None:
                 # No fitted value: extrapolate from the default coefficient.
                 x[1] = x[0] * a + b
             RC_dict['aa'][key] = x[1]
         if 'C' not in RC_dict['aa']:
             RC_dict['aa']['C'] = RC_dict['aa']['C*']
     except Exception:
         # Same message as before, now with the traceback attached.
         logger.exception('Error in get_RC for achrom model. Using RCs_gilar_rp')
         RC_dict = RC_def
     self.RC = RC_dict
Beispiel #2
0
 def get_calibrate_coeff(self):
     """Return ``aux.linear_regression`` of experimental vs. predicted RT.

     Each peptide sequence contributes one point per distinct experimental
     RT: an observation is skipped when the same sequence has already been
     recorded with an experimental RT less than 2 units away.
     """
     recorded_rts = {}  # sequence -> experimental RTs already used
     predicted_points = []
     observed_points = []
     for pept, rt_observed, rt_model in izip(self.peptideslist, self.RT_exp, self.RT_predicted):
         earlier = recorded_rts.setdefault(pept.sequence, [])
         if any(abs(rt_observed - prev) < 2 for prev in earlier):
             continue
         earlier.append(rt_observed)
         predicted_points.append(rt_model)
         observed_points.append(rt_observed)
     return aux.linear_regression(predicted_points, observed_points)
Beispiel #3
0
def rt_filtering(results, settings):
    """Build a retention-time filter from search results.

    Retention coefficients are fitted on the top candidate of every result,
    and the 0.05 / 99.95 percentiles of (experimental - theoretical) RT
    define the accepted window. The resulting callable is stored in a copy
    of ``settings`` under ``('scoring', 'condition')``.

    Parameters
    ----------
    results : iterable of dict
        Each item provides ``'spectrum'`` and ``'candidates'``; the sequence
        is read from ``res['candidates'][0][1]``.
    settings : config-like object
        Must support ``copy``, ``has_option``, ``get`` and ``set``.

    Returns
    -------
    A copy of ``settings``. It is returned without a condition when
    ``results`` is empty or when all experimental RTs are close to zero.
    """
    settings = settings.copy()
    if settings.has_option('misc', 'legend'):
        legend = settings.get('misc', 'legend')
    else:
        legend = None

    pairs = [(utils.get_RT(res['spectrum']), res['candidates'][0][1])
             for res in results]
    if not pairs:
        # zip(*[]) cannot be unpacked into two names; there is nothing to fit.
        logger.warning('No results provided. Skipping RT optimization.')
        return settings
    RTexp, seqs = zip(*pairs)

    if legend is not None:
        stdl = set(parser.std_labels)
        newseqs = []
        for s in seqs:
            if parser.fast_valid(s):
                newseqs.append(list(s))
                continue
            seq = []
            # Dedicated flag names: the original reused ``c`` as the loop
            # variable, which clobbered the C-terminus flag.
            has_nterm = has_cterm = False
            for label in s:
                if label in stdl:
                    seq.append(label)
                    continue
                mod, residue, term = legend[label]
                if residue == '-':
                    if term == '[':
                        seq.append(mod + '-')
                        has_nterm = True
                    else:
                        seq.append('-' + mod)
                        has_cterm = True
                else:
                    seq.append(mod + residue)
            # Add the standard termini once per sequence, after all labels
            # have been seen (previously this ran inside the label loop).
            # NOTE(review): assumes terminal modification labels sit at the
            # ends of the encoded sequence -- confirm against the encoder.
            if not has_nterm:
                seq.insert(0, parser.std_nterm)
            if not has_cterm:
                seq.append(parser.std_cterm)
            newseqs.append(seq)
        seqs = newseqs

    RTexp = [float(x) for x in RTexp]
    if np.allclose(RTexp, 0):
        logger.warning('RT is missing. Skipping RT optimization.')
        return settings

    RC_def = achrom.RCs_gilar_rp
    # label -> [default coefficient, fitted coefficient]
    xdict = {}
    for key, val in RC_def['aa'].items():
        xdict[key] = [val, None]
    RC_dict = utils.get_RCs_vary_lcp(seqs, RTexp)
    for key, val in RC_dict['aa'].items():
        # Labels unknown to the default table use the fitted value for both.
        xdict.setdefault(key, [val, None])[1] = val

    fitted = [(default, value) for default, value in xdict.values()
              if value is not None]
    a, b, _, _ = aux.linear_regression(
        [pair[0] for pair in fitted],
        [pair[1] for pair in fitted])

    RC_dict_new = {}
    for key, x in xdict.items():
        if x[1] is None:
            # No fitted value: extrapolate from the default coefficient.
            x[1] = x[0] * a + b
        RC_dict_new[key] = x[1]

    if legend is not None:
        for k, v in legend.items():
            if len(k) == 1:
                continue
            # NOTE(review): the key layout for terminal modifications is not
            # visible here; the slicing below is kept exactly as it was.
            if k[-1] in '[]':
                if k[-2] == '-':
                    kk = ('-' + k[1:-1]) if k[-1] == ']' else (k[:-1])
                else:
                    kk = k[:-1]
            else:
                kk = k
            logger.debug('%s -> %s', k, kk)
            if kk in RC_dict_new:
                RC_dict_new[v] = RC_dict_new[kk]
            else:
                # Reset per entry so a value from a previous iteration is
                # never reused (and the name is never unbound).
                kkk = None
                if kk[-1].isupper():
                    kkk = kk[-1]
                elif kk[-1] == '-':
                    kkk = parser.std_nterm
                elif kk[0] == '-':
                    kkk = parser.std_cterm
                RC_dict_new[v] = RC_dict_new.get(kkk, 0)
                logger.info('No RC for %s, using %s or 0: %s', kk, kkk,
                            RC_dict_new[v])

    RC_dict['aa'] = RC_dict_new

    logger.debug('RC dict: %s', RC_dict)
    rtexp = np.array(RTexp)
    rttheor = np.array(
        [calculate_RT(pep, RC_dict, raise_no_mod=False) for pep in seqs])
    deltaRT = rtexp - rttheor
    logger.debug('Linear regression: %s',
                 aux.linear_regression(rtexp, rttheor))
    best_RT_l = scoreatpercentile(deltaRT, 0.05)
    best_RT_r = scoreatpercentile(deltaRT, 99.95)

    def condition(spectrum, cand, _, stored_value=False):
        """Return (is RT difference inside the window, theoretical RT)."""
        # A falsy stored_value means "not computed yet".
        if not stored_value:
            stored_value = calculate_RT(cand, RC_dict)
        rtd = spectrum['RT'] - stored_value
        return best_RT_l <= rtd <= best_RT_r, stored_value

    settings.set('scoring', 'condition', condition)
    return settings
Beispiel #4
0
# Histogram of peptide m/z values: 2000 bins over 0-4000, i.e. 2 per bin.
plt.figure()
plt.hist([peptide['m/z'] for peptide in peptides], bins=2000, range=(0, 4000))
plt.xlabel('m/z, Th')
plt.ylabel('# of peptides within 2 Th bin')

# Histogram of peptide charges.
plt.figure()
plt.hist([peptide['charge'] for peptide in peptides], bins=20, range=(0, 10))
plt.xlabel('charge, e')
plt.ylabel('# of peptides')

# 2-D histogram of the two retention times; empty bins are set to NaN.
x = [peptide['RT_RP'] for peptide in peptides]
y = [peptide['RT_normal'] for peptide in peptides]
heatmap, xbins, ybins = np.histogram2d(x, y, bins=100)
heatmap[heatmap == 0] = np.nan
a, b, r, stderr = auxiliary.linear_regression(x, y)

plt.figure()
# np.histogram2d bins x along the first (row) axis, while imshow draws rows
# vertically with the origin at the top. Transpose and put the origin at the
# bottom so x runs along the horizontal axis as the labels state, and pass
# the bin edges as the extent so ticks are in data units, not bin indices.
plt.imshow(heatmap.T,
           origin='lower',
           extent=[xbins[0], xbins[-1], ybins[0], ybins[-1]],
           aspect='auto')
plt.xlabel('RT on RP, min')
plt.ylabel('RT on normal phase, min')
plt.title('All tryptic peptides, RT correlation = {0}'.format(r))

# 2-D histogram of m/z (150 bins over 0-4000) against RT_RP
# (2000 bins over 0-150); empty bins are set to NaN.
x = [peptide['m/z'] for peptide in peptides]
y = [peptide['RT_RP'] for peptide in peptides]
heatmap, xbins, ybins = np.histogram2d(x,
                                       y,
                                       bins=[150, 2000],
                                       range=[[0, 4000], [0, 150]])
heatmap[heatmap == 0] = np.nan
a, b, r, stderr = auxiliary.linear_regression(x, y)
Beispiel #5
0
def prepare_dataframe(infile_path,
                      decoy_prefix=None,
                      decoy_infix=False,
                      cleavage_rule=False,
                      fdr=0.01,
                      decoy2set=None):
    """Load a search-engine result file into a normalized PSM DataFrame.

    Parameters
    ----------
    infile_path : str
        Path to a ``.pep.xml`` / ``.pepxml`` or ``.mzid`` file (the extension
        check is case-insensitive).
    decoy_prefix, decoy_infix
        Passed to ``is_decoy`` when labelling decoy PSMs.
    cleavage_rule
        Rule used to count missed cleavages; any falsy value selects
        ``parser.expasy_rules['trypsin']``.
    fdr : float
        FDR threshold passed to ``filter_custom`` for the default filtering.
    decoy2set
        Set of protein names forming the second decoy set. When None, it is
        obtained from ``split_decoys(df1)``.

    Returns
    -------
    tuple
        ``(df1, decoy2set)``: the prepared DataFrame and the decoy set used.

    Raises
    ------
    WrongInputError
        Unsupported file extension, or a file type for which no modification
        parser is available.
    EmptyFileError
        The file contains no rows.
    NoDecoyError
        No decoy PSMs were found.
    """
    if not cleavage_rule:
        cleavage_rule = parser.expasy_rules['trypsin']
    lowered_path = infile_path.lower()
    if lowered_path.endswith(('.pep.xml', '.pepxml')):
        df1 = pepxml.DataFrame(infile_path)
        ftype = 'pepxml'
    elif lowered_path.endswith('.mzid'):
        df1 = mzid.DataFrame(infile_path)
        # Refined to 'msgf' below when MS-GF+ columns are present. Always
        # bound so the modification step cannot hit an UnboundLocalError.
        ftype = 'mzid'
    else:
        raise WrongInputError()
    if not df1.shape[0]:
        raise EmptyFileError()

    if 'Morpheus Score' in df1.columns:
        # .copy() so that the assignments below write to an independent frame
        # instead of a filtered slice.
        df1 = df1[df1['Morpheus Score'] != 0].copy()
        df1['expect'] = 1 / df1['Morpheus Score']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))

    if 'MS-GF:EValue' in df1.columns:
        # MSGF search engine: map its columns to the names used below.
        ftype = 'msgf'
        df1['peptide'] = df1['PeptideSequence']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))
        df1['assumed_charge'] = df1['chargeState']
        df1['spectrum'] = df1['spectrumID']
        df1['massdiff'] = (
            df1['experimentalMassToCharge'] -
            df1['calculatedMassToCharge']) * df1['assumed_charge']
        # 1.00727649 is presumably the proton mass in Da -- TODO confirm.
        df1['calc_neutral_pep_mass'] = df1['calculatedMassToCharge'] * df1[
            'chargeState'] - df1['chargeState'] * 1.00727649
        df1['protein'] = df1['accession']
        df1['protein_descr'] = df1['protein description']
        df1['expect'] = df1['MS-GF:EValue']

    if set(df1['protein_descr'].str[0]) == {None}:
        # MSFragger: each protein entry is "name description"; split them.
        # The frame is non-empty here (the set above has an element), so
        # positional access is safe, unlike the label-based .loc[1, ...].
        logger.debug('Adapting MSFragger DataFrame.')
        logger.debug('Proteins before: %s', df1['protein'].iloc[0])
        protein = df1['protein'].apply(
            lambda row: [x.split(None, 1) for x in row])
        df1['protein'] = protein.apply(lambda row: [x[0] for x in row])
        # Per-entry fallback: an entry without a description becomes ''
        # without discarding the descriptions of all other entries.
        df1['protein_descr'] = protein.apply(
            lambda row: [x[1] if len(x) > 1 else '' for x in row])
        logger.debug('Proteins after: %s', df1['protein'].iloc[0])

    # Where the description is missing, use the protein name instead.
    df1.loc[pd.isna(df1['protein_descr']),
            'protein_descr'] = df1.loc[pd.isna(df1['protein_descr']),
                                       'protein']

    df1 = df1[~pd.isna(df1['peptide'])].copy()
    if 'MS1Intensity' not in df1:
        df1['MS1Intensity'] = 0.0
    df1['length'] = df1['peptide'].apply(len)
    # Peptides shorter than 6 residues are discarded.
    df1 = df1[df1['length'] >= 6].copy()
    df1['spectrum'] = df1['spectrum'].apply(lambda x: x.split(' RTINS')[0])
    if 'retention_time_sec' not in df1.columns:
        if 'scan start time' in df1.columns:
            df1['RT exp'] = df1['scan start time']
            df1 = df1.drop([
                'scan start time',
            ], axis=1)
        else:
            df1['RT exp'] = 0
    else:
        # Seconds to minutes.
        df1['RT exp'] = df1['retention_time_sec'] / 60
        df1 = df1.drop([
            'retention_time_sec',
        ], axis=1)

    df1['massdiff_int'] = df1['massdiff'].apply(lambda x: int(round(x, 0)))
    # ppm error after removing the integer part of the mass difference scaled
    # by 1.003354 (presumably the isotope mass spacing -- TODO confirm).
    df1['massdiff_ppm'] = 1e6 * (df1['massdiff'] - df1['massdiff_int'] *
                                 1.003354) / df1['calc_neutral_pep_mass']

    df1['decoy'] = df1['protein'].apply(is_decoy,
                                        decoy_prefix=decoy_prefix,
                                        decoy_infix=decoy_infix)
    if not df1.decoy.sum():
        raise NoDecoyError()
    if decoy2set is None:
        # presumably also adds the 'decoy1'/'decoy2' columns read below;
        # verify against split_decoys.
        decoy2set = split_decoys(df1)
    else:
        df1['decoy2'] = df1['protein'].apply(
            lambda p: all(x in decoy2set for x in p))
        df1['decoy1'] = df1['decoy'] & (~df1['decoy2'])
    df1 = remove_column_hit_rank(df1)

    if ftype == 'pepxml':
        df1['mods_counter'] = df1.apply(parse_mods, axis=1)
    elif ftype == 'msgf':
        df1['mods_counter'] = df1.apply(parse_mods_msgf, axis=1)
    else:
        # An mzid file without MS-GF+ columns: no modification parser exists.
        raise WrongInputError()
    prepare_mods(df1)

    pep_ratio = df1['decoy2'].sum() / df1['decoy'].sum()
    df1_f = filter_custom(df1[~df1['decoy1']],
                          fdr=fdr,
                          key='expect',
                          is_decoy='decoy2',
                          reverse=False,
                          remove_decoy=False,
                          ratio=pep_ratio,
                          formula=1,
                          correction=None,
                          loglabel='PSMs default')
    num_psms_def = df1_f[~df1_f['decoy2']].shape[0]
    logger.info(
        'Default target-decoy filtering, 1%% PSM FDR: Number of target PSMs = %d',
        num_psms_def)
    try:
        logger.info('Calibrating retention model...')
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            retention_coefficients = achrom.get_RCs_vary_lcp(
                df1_f['peptide'].values, df1_f['RT exp'].values)
        df1_f['RT pred'] = df1_f['peptide'].apply(
            lambda x: calc_RT(x, retention_coefficients))
        df1['RT pred'] = df1['peptide'].apply(
            lambda x: calc_RT(x, retention_coefficients))
        _, _, r_value, std_value = aux.linear_regression(
            df1_f['RT pred'], df1_f['RT exp'])
        logger.info('RT model training results: R^2 = %f , std = %f',
                    r_value**2, std_value)
        df1['RT diff'] = df1['RT pred'] - df1['RT exp']
        logger.info('Retention model calibrated successfully.')
    except Exception:
        # Best-effort fallback to a fixed coefficient table; the actual
        # cause is kept at debug level so it is not lost.
        logger.warning('Retention times are probably missing in input file.')
        logger.debug('Retention model calibration failed.', exc_info=True)
        df1['RT pred'] = df1['peptide'].apply(
            lambda x: calc_RT(x, achrom.RCs_krokhin_100A_tfa))
        df1['RT diff'] = df1['RT exp']
    return df1, decoy2set
Beispiel #6
0
 def test_linear_regression_no_y_arr(self):
     """Regression called with one NumPy array of (x, y) pairs and no ``y``."""
     pairs = list(zip(self.x, self.y))
     points = np.array(pairs)
     self._test_linreg(aux.linear_regression(points))
Beispiel #7
0
 def test_linear_regression_no_y_list(self):
     """Regression called with one list of (x, y) pairs and no ``y``."""
     pairs = list(zip(self.x, self.y))
     self._test_linreg(aux.linear_regression(pairs))
Beispiel #8
0
 def test_linear_regression_simple(self):
     """Regression called with separate ``x`` and ``y`` sequences."""
     xs, ys = self.x, self.y
     self._test_linreg(aux.linear_regression(xs, ys))