Exemple #1
0
 def get_RC(self):
     """Fit achrom retention coefficients (RCs) for this object's peptides.

     RCs are fitted with ``achrom.get_RCs_vary_lcp`` on the modified
     sequences of ``self.peptideslist`` against ``self.RT_exp``.  Labels that
     exist in the default ``achrom.RCs_gilar_rp`` table but were not produced
     by the fit are filled in through a linear mapping (default RC -> fitted
     RC) estimated on the labels both tables have in common.

     The resulting dictionary is stored in ``self.RC``; nothing is returned.
     If any step fails, an error is logged and a copy of the default table
     (extended with ``U`` and ``O``) is stored instead.
     """
     # Work on a private copy: ``achrom.RCs_gilar_rp`` is module-level state
     # of the library and must not be modified as a side effect of this call.
     default_rcs = dict(achrom.RCs_gilar_rp)
     default_aa = dict(default_rcs['aa'])
     # 'U' falls back to the coefficient of 'C' and 'O' to that of 'K' when
     # the default table has no entry of its own for them.
     default_aa.setdefault('U', default_aa.get('C', 0.0))
     default_aa.setdefault('O', default_aa.get('K', 0.0))
     default_rcs['aa'] = default_aa
     try:
         seqs = [pept.modified_sequence for pept in self.peptideslist]
         aa_labels = set(default_aa)
         for pept in self.peptideslist:
             # ``values()`` exists on both Python 2 and 3 dictionaries,
             # unlike the Python-2-only ``itervalues()``.
             aa_labels.update(pept.modification_list.values())
         RC_dict = achrom.get_RCs_vary_lcp(seqs, self.RT_exp,
                                           labels=aa_labels)
         # label -> [default RC, fitted RC]; ``None`` marks a missing side.
         xdict = {label: [rc, None] for label, rc in default_aa.items()}
         for label, rc in RC_dict['aa'].items():
             if label in xdict:
                 xdict[label][1] = rc
             else:
                 xdict[label] = [None, rc]
         # Only labels known to both tables can anchor the regression.
         shared = [pair for pair in xdict.values()
                   if pair[0] is not None and pair[1] is not None]
         a, b, _, _ = aux.linear_regression(
             [pair[0] for pair in shared], [pair[1] for pair in shared])
         for label, pair in xdict.items():
             if pair[1] is None:
                 # Not fitted: project the default RC onto the fitted scale.
                 pair[1] = pair[0] * a + b
             RC_dict['aa'][label] = pair[1]
         if 'C' not in RC_dict['aa']:
             RC_dict['aa']['C'] = RC_dict['aa']['C*']
     except Exception:
         logger.error('Error in get_RC for achrom model. Using RCs_gilar_rp')
         RC_dict = default_rcs
     self.RC = RC_dict
Exemple #2
0
def pyteomcis_snippets(traindf, trainy, testdf, nomod=True):
    """Fit a pyteomics achrom model and predict train/test retention times.

    Parameters
    ----------
    traindf : iterable
        Training peptide sequences (each item is converted with ``str``).
    trainy : sequence
        Retention times matching ``traindf``, passed to
        ``achrom.get_RCs_vary_lcp``.
    testdf : iterable
        Peptide sequences to predict after fitting.
    nomod : bool, optional
        Forwarded to ``achrom.calculate_RT`` as ``raise_no_mod``.

    Returns
    -------
    tuple
        ``(yhat_train, yhat_test)``: lists of predicted retention times for
        the training and test sequences, in input order.
    """
    def _normalize(seq):
        # The model is trained with "U" written as "C"; prediction has to
        # use the same alphabet or "U" would have no coefficient.
        return str(seq).replace("U", "C")

    train_seqs = [_normalize(seq) for seq in traindf]
    test_seqs = [_normalize(seq) for seq in testdf]

    # pyteomics
    clf = achrom.get_RCs_vary_lcp(train_seqs, trainy)
    print("Pyteomics LCP: {}".format(clf["lcp"]))
    yhat_train = [
        achrom.calculate_RT(seq, clf, raise_no_mod=nomod)
        for seq in train_seqs
    ]
    yhat_test = [
        achrom.calculate_RT(seq, clf, raise_no_mod=nomod)
        for seq in test_seqs
    ]
    return (yhat_train, yhat_test)
Exemple #3
0
def rt_prediction(calib_file, finalMS1_df):
    """Predict retention times from a calibration spreadsheet.

    An achrom model is fitted on the ``Sequence`` / ``PepRtimePeak`` columns
    of the Excel file ``calib_file`` and applied to the ``pept`` column of
    ``finalMS1_df``.

    Parameters
    ----------
    calib_file : str
        Path of the Excel calibration file; it must provide the columns
        ``Sequence``, ``PepRtimePeak``, ``PepRtimeStart`` and
        ``PepRtimeEnd``.
    finalMS1_df : pandas.DataFrame
        Frame with a ``pept`` column holding the peptides to predict.

    Returns
    -------
    tuple
        ``(initRT_tuple, initRT_width)`` where ``initRT_tuple`` is a list of
        ``(index, peptide, predicted RT)`` tuples, one per row of
        ``finalMS1_df``, and ``initRT_width`` is the mean of
        ``PepRtimeEnd - PepRtimeStart`` over the calibration rows.
    """
    # Local import keeps the block self-contained.  ``np.warnings`` was only
    # an accidental alias of this module and is not available in recent
    # NumPy releases.
    import warnings

    # Single pre-formatted argument: prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print(' from: %s' % (calib_file,))
    calib_df = pd.read_excel(calib_file)
    reten_seq = [str(x) for x in calib_df['Sequence']]
    reten_peak = calib_df['PepRtimePeak'].tolist()

    # Silence warnings for the numeric work only, instead of installing a
    # process-wide "ignore" filter that outlives this call.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        RCs = achrom.get_RCs_vary_lcp(reten_seq, reten_peak)

        initRT_tuple = []
        for idx, row in finalMS1_df.iterrows():
            pept = str(row['pept'])
            initRT_tuple.append((idx, pept, achrom.calculate_RT(pept, RCs)))

        reten_start = calib_df['PepRtimeStart']
        reten_end = calib_df['PepRtimeEnd']
        initRT_width = np.mean((reten_end - reten_start).values)

    return initRT_tuple, initRT_width
Exemple #4
0
def prepare_dataframe(infile_path,
                      decoy_prefix=None,
                      decoy_infix=False,
                      cleavage_rule=False,
                      fdr=0.01,
                      decoy2set=None):
    """Load search results into a DataFrame and prepare it for rescoring.

    The file is read as pepXML or mzIdentML, engine-specific columns
    (Morpheus, MS-GF+, MSFragger) are mapped onto common names, helper
    columns are derived (length, mass differences, decoy flags, modification
    counters), a default target-decoy filtering is run and an achrom
    retention model is calibrated on the filtered PSMs.

    Parameters
    ----------
    infile_path : str
        Input path; the extension (case-insensitive) must be ``.pep.xml``,
        ``.pepxml`` or ``.mzid``.
    decoy_prefix, decoy_infix
        Forwarded unchanged to ``is_decoy``.
    cleavage_rule
        Rule passed to ``parser.num_sites``; a falsy value selects
        ``parser.expasy_rules['trypsin']``.
    fdr : float
        FDR level forwarded to ``filter_custom``.
    decoy2set
        When ``None`` it is obtained from ``split_decoys(df1)``; otherwise
        rows whose proteins are all contained in it are flagged ``decoy2``.

    Returns
    -------
    tuple
        ``(df1, decoy2set)``: the prepared DataFrame and the decoy set used.

    Raises
    ------
    WrongInputError
        Unsupported file extension, or a file flavour for which no
        modification parser is available.
    EmptyFileError
        The file contains no rows.
    NoDecoyError
        No row is flagged as decoy.
    """
    if not cleavage_rule:
        cleavage_rule = parser.expasy_rules['trypsin']
    lowered_path = infile_path.lower()
    if lowered_path.endswith(('.pep.xml', '.pepxml')):
        df1 = pepxml.DataFrame(infile_path)
        ftype = 'pepxml'
    elif lowered_path.endswith('.mzid'):
        df1 = mzid.DataFrame(infile_path)
        # Always bind ftype; it is refined to 'msgf' below when the MS-GF+
        # columns are present.
        ftype = 'mzid'
    else:
        raise WrongInputError()
    if not df1.shape[0]:
        raise EmptyFileError()

    if 'Morpheus Score' in df1.columns:
        df1 = df1[df1['Morpheus Score'] != 0]
        # Inverse score: 'expect' is filtered in ascending order below.
        df1['expect'] = 1 / df1['Morpheus Score']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))

    if 'MS-GF:EValue' in df1.columns:
        # MSGF search engine
        ftype = 'msgf'
        df1['peptide'] = df1['PeptideSequence']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))
        df1['assumed_charge'] = df1['chargeState']
        df1['spectrum'] = df1['spectrumID']
        df1['massdiff'] = (
            df1['experimentalMassToCharge'] -
            df1['calculatedMassToCharge']) * df1['assumed_charge']
        # 1.00727649 is presumably the proton mass -- TODO confirm.
        df1['calc_neutral_pep_mass'] = df1['calculatedMassToCharge'] * df1[
            'chargeState'] - df1['chargeState'] * 1.00727649
        df1['protein'] = df1['accession']
        df1['protein_descr'] = df1['protein description']
        df1['expect'] = df1['MS-GF:EValue']

    if set(df1['protein_descr'].str[0]) == {None}:
        # MSFragger
        logger.debug('Adapting MSFragger DataFrame.')
        # Positional access: label 1 is not guaranteed to exist (for
        # example in a single-row frame), the first row always does here.
        logger.debug('Proteins before: %s', df1['protein'].iloc[0])
        # Each protein entry is split on its first whitespace run into
        # [accession, description]; the description may be absent.
        protein = df1['protein'].apply(
            lambda row: [x.split(None, 1) for x in row])
        df1['protein'] = protein.apply(lambda row: [x[0] for x in row])
        # Fall back to '' per entry, so one entry without a description
        # does not discard the descriptions of all other rows.
        df1['protein_descr'] = protein.apply(
            lambda row: [x[1] if len(x) > 1 else '' for x in row])
        logger.debug('Proteins after: %s', df1['protein'].iloc[0])

    # Rows without a description reuse the protein entry itself.
    df1.loc[pd.isna(df1['protein_descr']),
            'protein_descr'] = df1.loc[pd.isna(df1['protein_descr']),
                                       'protein']

    df1 = df1[~pd.isna(df1['peptide'])]
    if 'MS1Intensity' not in df1:
        df1['MS1Intensity'] = 0.0
    df1['length'] = df1['peptide'].apply(len)
    df1 = df1[df1['length'] >= 6]
    # Keep only the part of the spectrum title preceding ' RTINS'.
    df1['spectrum'] = df1['spectrum'].apply(lambda x: x.split(' RTINS')[0])
    if 'retention_time_sec' not in df1.columns:
        if 'scan start time' in df1.columns:
            df1['RT exp'] = df1['scan start time']
            df1 = df1.drop([
                'scan start time',
            ], axis=1)
        else:
            df1['RT exp'] = 0
    else:
        # Seconds -> minutes.
        df1['RT exp'] = df1['retention_time_sec'] / 60
        df1 = df1.drop([
            'retention_time_sec',
        ], axis=1)

    df1['massdiff_int'] = df1['massdiff'].apply(lambda x: int(round(x, 0)))
    # The integer part of the mass shift is removed before converting to
    # ppm; 1.003354 is presumably the 13C-12C mass difference -- TODO
    # confirm.
    df1['massdiff_ppm'] = 1e6 * (df1['massdiff'] - df1['massdiff_int'] *
                                 1.003354) / df1['calc_neutral_pep_mass']

    df1['decoy'] = df1['protein'].apply(is_decoy,
                                        decoy_prefix=decoy_prefix,
                                        decoy_infix=decoy_infix)
    if not df1.decoy.sum():
        raise NoDecoyError()
    if decoy2set is None:
        decoy2set = split_decoys(df1)
    else:
        df1['decoy2'] = df1['protein'].apply(
            lambda p: all(x in decoy2set for x in p))
        df1['decoy1'] = df1['decoy'] & (~df1['decoy2'])
    df1 = remove_column_hit_rank(df1)

    if ftype == 'pepxml':
        df1['mods_counter'] = df1.apply(parse_mods, axis=1)
    elif ftype == 'msgf':
        df1['mods_counter'] = df1.apply(parse_mods_msgf, axis=1)
    else:
        # mzIdentML without MS-GF+ columns: there is no modification parser
        # for it, so report the input as unsupported instead of failing on
        # an unbound local name.
        raise WrongInputError()
    prepare_mods(df1)

    pep_ratio = df1['decoy2'].sum() / df1['decoy'].sum()
    df1_f = filter_custom(df1[~df1['decoy1']],
                          fdr=fdr,
                          key='expect',
                          is_decoy='decoy2',
                          reverse=False,
                          remove_decoy=False,
                          ratio=pep_ratio,
                          formula=1,
                          correction=None,
                          loglabel='PSMs default')
    num_psms_def = df1_f[~df1_f['decoy2']].shape[0]
    # Report the FDR level actually used (prints "1%" for the default 0.01).
    logger.info(
        'Default target-decoy filtering, %g%% PSM FDR: Number of target PSMs = %d',
        fdr * 100, num_psms_def)
    try:
        logger.info('Calibrating retention model...')
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            retention_coefficients = achrom.get_RCs_vary_lcp(
                df1_f['peptide'].values, df1_f['RT exp'].values)
        df1_f['RT pred'] = df1_f['peptide'].apply(
            lambda x: calc_RT(x, retention_coefficients))
        df1['RT pred'] = df1['peptide'].apply(
            lambda x: calc_RT(x, retention_coefficients))
        _, _, r_value, std_value = aux.linear_regression(
            df1_f['RT pred'], df1_f['RT exp'])
        logger.info('RT model training results: R^2 = %f , std = %f',
                    r_value**2, std_value)
        df1['RT diff'] = df1['RT pred'] - df1['RT exp']
        logger.info('Retention model calibrated successfully.')
    except Exception:
        logger.warning('Retention times are probably missing in input file.')
        # Keep the real cause available for troubleshooting.
        logger.debug('Retention model calibration failed.', exc_info=True)
        # Fall back to a stock coefficient table for the predictions.
        df1['RT pred'] = df1['peptide'].apply(
            lambda x: calc_RT(x, achrom.RCs_krokhin_100A_tfa))
        df1['RT diff'] = df1['RT exp']
    return df1, decoy2set