def get_RC(self):
    """Fit retention coefficients for this object's peptides and store them in ``self.RC``.

    Reads ``self.peptideslist`` (each item must expose ``modified_sequence``
    and a ``modification_list`` mapping) and ``self.RT_exp``, fits a model
    with ``achrom.get_RCs_vary_lcp`` and then fills in every label that the
    fit did not produce: fitted coefficients are regressed against the
    ``achrom.RCs_gilar_rp`` defaults, and the resulting line is used to map
    the default value of each missing label onto the fitted scale.

    If anything in that procedure raises, the error is logged and
    ``self.RC`` is set to ``achrom.RCs_gilar_rp`` instead.

    Returns:
        None. The result is written to ``self.RC``.
    """
    try:
        seqs = [pept.modified_sequence for pept in self.peptideslist]
        RTexp = self.RT_exp

        # NOTE(review): RC_def is the library-level dict itself, not a copy,
        # so the two setdefault calls below add 'U' and 'O' to
        # achrom.RCs_gilar_rp for the rest of the process. This is kept
        # unchanged because the fallback branch hands out that same dict --
        # confirm with callers before switching to a private copy.
        RC_def = achrom.RCs_gilar_rp
        default_aa = RC_def['aa']
        default_aa.setdefault('U', default_aa.get('C', 0.0))
        default_aa.setdefault('O', default_aa.get('K', 0.0))

        # Labels offered to the fit: every default label plus every value
        # found in the peptides' modification_list mappings.
        aa_labels = set(default_aa)
        for pept in self.peptideslist:
            # .values() instead of the Python-2-only .itervalues().
            aa_labels.update(pept.modification_list.values())

        RC_dict = achrom.get_RCs_vary_lcp(seqs, RTexp, labels=aa_labels)
        fitted_aa = RC_dict['aa']

        # label -> [default RC, fitted RC]; None marks the missing side.
        xdict = {key: [val, None] for key, val in default_aa.items()}
        for key, val in fitted_aa.items():
            if key in xdict:
                xdict[key][1] = val
            else:
                xdict[key] = [None, val]

        # Regress fitted RCs on default RCs using only the labels that have
        # both values; a and b are applied below as ``default * a + b``.
        paired = [x for x in xdict.values()
                  if x[0] is not None and x[1] is not None]
        a, b, _, _ = aux.linear_regression(
            [x[0] for x in paired], [x[1] for x in paired])

        # A label without a fitted value always has a default one, because
        # it can only have entered xdict from the defaults.
        for key, x in xdict.items():
            if x[1] is None:
                x[1] = x[0] * a + b
                fitted_aa[key] = x[1]

        if 'C' not in fitted_aa:
            fitted_aa['C'] = fitted_aa['C*']
    except Exception:
        # Deliberate best-effort: any failure above falls back to the
        # default coefficient set. ``except Exception`` (rather than a bare
        # except) lets KeyboardInterrupt / SystemExit propagate.
        logger.error('Error in get_RC for achrom model. Using RCs_gilar_rp')
        RC_dict = achrom.RCs_gilar_rp
    self.RC = RC_dict
def pyteomcis_snippets(traindf, trainy, testdf, nomod=True):
    """Fit a pyteomics retention model on training data and predict both sets.

    Every sequence is converted with ``str()`` and has "U" replaced by "C"
    before it is used. The same normalisation is applied for fitting and
    for prediction, so the model is never asked about a residue that was
    rewritten away during training.

    Args:
        traindf: iterable of training peptide sequences.
        trainy: retention times matching ``traindf``, passed unchanged to
            ``achrom.get_RCs_vary_lcp``.
        testdf: iterable of test peptide sequences.
        nomod: forwarded to ``achrom.calculate_RT`` as ``raise_no_mod``.

    Returns:
        Tuple ``(yhat_train, yhat_test)`` of two lists with the predicted
        retention times for the training and the test sequences.
    """
    def _normalise(seq):
        # Same rewrite the fit uses: stringify and map "U" onto "C".
        return str(seq).replace("U", "C")

    # Materialise once so a one-shot iterable is not exhausted by the fit
    # before the training predictions are computed.
    train_seqs = [_normalise(seq) for seq in traindf]
    test_seqs = [_normalise(seq) for seq in testdf]

    clf = achrom.get_RCs_vary_lcp(train_seqs, trainy)
    print("Pyteomics LCP: {}".format(clf["lcp"]))

    yhat_train = [
        achrom.calculate_RT(seq, clf, raise_no_mod=nomod)
        for seq in train_seqs
    ]
    yhat_test = [
        achrom.calculate_RT(seq, clf, raise_no_mod=nomod)
        for seq in test_seqs
    ]
    return (yhat_train, yhat_test)
def rt_prediction(calib_file, finalMS1_df):
    """Predict retention times for the peptides in ``finalMS1_df``.

    A retention model is fitted with ``achrom.get_RCs_vary_lcp`` on the
    ``Sequence`` / ``PepRtimePeak`` columns of the calibration spreadsheet
    and then applied to the ``pept`` column of ``finalMS1_df``.

    Args:
        calib_file: path of the calibration spreadsheet, read with
            ``pd.read_excel``. It must provide the columns ``Sequence``,
            ``PepRtimePeak``, ``PepRtimeStart`` and ``PepRtimeEnd``.
        finalMS1_df: DataFrame with a ``pept`` column.

    Returns:
        Tuple ``(initRT_tuple, initRT_width)``: a list of
        ``(index, peptide, predicted_rt)`` tuples, one per row of
        ``finalMS1_df``, and the mean of ``PepRtimeEnd - PepRtimeStart``
        over the calibration rows.
    """
    # Imported locally so the block does not rely on a module-level import.
    import warnings

    # Single-argument form prints the same text under Python 2 and 3.
    print(' from: %s' % (calib_file,))
    calib_df = pd.read_excel(calib_file)
    reten_seq = [str(x) for x in calib_df['Sequence']]
    reten_peak = calib_df['PepRtimePeak'].tolist()

    # ``np.warnings`` is not a public NumPy API and is missing from recent
    # releases, so the stdlib module is used directly.
    # NOTE(review): this silences warnings for the whole process, as the
    # original call did; kept to avoid changing what callers observe.
    warnings.filterwarnings('ignore')
    RCs = achrom.get_RCs_vary_lcp(reten_seq, reten_peak)

    initRT_tuple = []
    for idx, row in finalMS1_df.iterrows():
        pept = str(row['pept'])
        initRT_tuple.append((idx, pept, achrom.calculate_RT(pept, RCs)))

    peak_widths = (calib_df['PepRtimeEnd'] - calib_df['PepRtimeStart']).values
    initRT_width = np.mean(peak_widths)
    return initRT_tuple, initRT_width
def prepare_dataframe(infile_path, decoy_prefix=None, decoy_infix=False,
                      cleavage_rule=False, fdr=0.01, decoy2set=None):
    """Load a search result file and derive the per-PSM columns used downstream.

    The file is read with ``pepxml.DataFrame`` or ``mzid.DataFrame``
    depending on its extension. Engine-specific columns are then mapped
    onto common names, short or peptide-less rows are dropped, decoys are
    labelled, modifications are counted, and a retention time model is
    calibrated on the PSMs that pass a target-decoy filter on ``expect``.

    Args:
        infile_path: path ending in ``.pep.xml``, ``.pepxml`` or ``.mzid``
            (case-insensitive).
        decoy_prefix: forwarded to ``is_decoy``.
        decoy_infix: forwarded to ``is_decoy``.
        cleavage_rule: rule passed to ``parser.num_sites``; any falsy value
            selects ``parser.expasy_rules['trypsin']``.
        fdr: threshold forwarded to ``filter_custom``.
        decoy2set: optional collection of protein names. When given, a PSM
            is flagged ``decoy2`` if all of its proteins are in it; when
            ``None``, ``split_decoys`` is called and its result returned.

    Returns:
        Tuple ``(df1, decoy2set)``.

    Raises:
        WrongInputError: unsupported extension, or an ``.mzid`` file with
            no ``MS-GF:EValue`` column (no modification parser applies).
        EmptyFileError: the parsed table has no rows.
        NoDecoyError: no row was flagged as decoy.
    """
    if not cleavage_rule:
        cleavage_rule = parser.expasy_rules['trypsin']

    lowered_path = infile_path.lower()
    if lowered_path.endswith(('.pep.xml', '.pepxml')):
        df1 = pepxml.DataFrame(infile_path)
        ftype = 'pepxml'
    elif lowered_path.endswith('.mzid'):
        df1 = mzid.DataFrame(infile_path)
        # Set to 'msgf' below if the MS-GF columns are present. Starting
        # from None keeps the name bound for the modification step.
        ftype = None
    else:
        raise WrongInputError()
    if not df1.shape[0]:
        raise EmptyFileError()

    if 'Morpheus Score' in df1.columns:
        df1 = df1[df1['Morpheus Score'] != 0]
        df1['expect'] = 1 / df1['Morpheus Score']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))

    if 'MS-GF:EValue' in df1.columns:
        # MSGF search engine: map its column names onto the common ones.
        ftype = 'msgf'
        df1['peptide'] = df1['PeptideSequence']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))
        df1['assumed_charge'] = df1['chargeState']
        df1['spectrum'] = df1['spectrumID']
        df1['massdiff'] = (
            df1['experimentalMassToCharge'] - df1['calculatedMassToCharge']
        ) * df1['assumed_charge']
        # NOTE(review): 1.00727649 is presumably the proton mass -- confirm.
        df1['calc_neutral_pep_mass'] = (
            df1['calculatedMassToCharge'] * df1['chargeState']
            - df1['chargeState'] * 1.00727649)
        df1['protein'] = df1['accession']
        df1['protein_descr'] = df1['protein description']
        df1['expect'] = df1['MS-GF:EValue']

    if set(df1['protein_descr'].str[0]) == {None}:
        # MSFragger: each 'protein' entry is split on the first whitespace
        # run into a name and a description.
        logger.debug('Adapting MSFragger DataFrame.')
        # The condition above guarantees at least one row. Addressing the
        # sample row by position keeps this debug line from raising when
        # index label 1 does not exist (e.g. a single-row table).
        sample_pos = min(1, df1.shape[0] - 1)
        logger.debug('Proteins before: %s', df1['protein'].iloc[sample_pos])
        protein = df1['protein'].apply(
            lambda row: [x.split(None, 1) for x in row])
        df1['protein'] = protein.apply(lambda row: [x[0] for x in row])
        try:
            df1['protein_descr'] = protein.apply(
                lambda row: [x[1] for x in row])
        except IndexError:
            # One entry without a description blanks the descriptions of
            # every row, not only the offending one.
            df1['protein_descr'] = protein.apply(
                lambda row: ['' for x in row])
        logger.debug('Proteins after: %s', df1['protein'].iloc[sample_pos])

    # Rows without a description fall back to the protein value itself.
    missing_descr = pd.isna(df1['protein_descr'])
    df1.loc[missing_descr, 'protein_descr'] = df1.loc[missing_descr, 'protein']

    df1 = df1[~pd.isna(df1['peptide'])]
    if 'MS1Intensity' not in df1:
        df1['MS1Intensity'] = 0.0
    df1['length'] = df1['peptide'].apply(len)
    df1 = df1[df1['length'] >= 6]
    df1['spectrum'] = df1['spectrum'].apply(lambda x: x.split(' RTINS')[0])

    # 'RT exp' comes from 'retention_time_sec' (divided by 60) when present,
    # else from 'scan start time' as-is, else it is set to 0.
    if 'retention_time_sec' not in df1.columns:
        if 'scan start time' in df1.columns:
            df1['RT exp'] = df1['scan start time']
            df1 = df1.drop(['scan start time', ], axis=1)
        else:
            df1['RT exp'] = 0
    else:
        df1['RT exp'] = df1['retention_time_sec'] / 60
        df1 = df1.drop(['retention_time_sec', ], axis=1)

    df1['massdiff_int'] = df1['massdiff'].apply(lambda x: int(round(x, 0)))
    # NOTE(review): 1.003354 looks like an isotope mass spacing -- confirm.
    df1['massdiff_ppm'] = 1e6 * (
        df1['massdiff'] - df1['massdiff_int'] * 1.003354
    ) / df1['calc_neutral_pep_mass']

    df1['decoy'] = df1['protein'].apply(
        is_decoy, decoy_prefix=decoy_prefix, decoy_infix=decoy_infix)
    if not df1.decoy.sum():
        raise NoDecoyError()
    if decoy2set is None:
        # NOTE(review): 'decoy1' / 'decoy2' are read below, so split_decoys
        # presumably adds them to df1 in place -- verify against its body.
        decoy2set = split_decoys(df1)
    else:
        df1['decoy2'] = df1['protein'].apply(
            lambda p: all(x in decoy2set for x in p))
        df1['decoy1'] = df1['decoy'] & (~df1['decoy2'])

    df1 = remove_column_hit_rank(df1)

    if ftype == 'pepxml':
        df1['mods_counter'] = df1.apply(parse_mods, axis=1)
    elif ftype == 'msgf':
        df1['mods_counter'] = df1.apply(parse_mods_msgf, axis=1)
    else:
        # mzIdentML without MS-GF columns: no modification parser applies.
        # This used to fail with UnboundLocalError on ``ftype``.
        raise WrongInputError()
    prepare_mods(df1)

    pep_ratio = df1['decoy2'].sum() / df1['decoy'].sum()
    df1_f = filter_custom(
        df1[~df1['decoy1']], fdr=fdr, key='expect', is_decoy='decoy2',
        reverse=False, remove_decoy=False, ratio=pep_ratio, formula=1,
        correction=None, loglabel='PSMs default')
    num_psms_def = df1_f[~df1_f['decoy2']].shape[0]
    # Report the threshold actually used; for the default fdr=0.01 the
    # message is unchanged ("1% PSM FDR").
    logger.info(
        'Default target-decoy filtering, %g%% PSM FDR: '
        'Number of target PSMs = %d', fdr * 100, num_psms_def)

    try:
        logger.info('Calibrating retention model...')
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            retention_coefficients = achrom.get_RCs_vary_lcp(
                df1_f['peptide'].values, df1_f['RT exp'].values)
        df1_f['RT pred'] = df1_f['peptide'].apply(
            lambda x: calc_RT(x, retention_coefficients))
        df1['RT pred'] = df1['peptide'].apply(
            lambda x: calc_RT(x, retention_coefficients))
        _, _, r_value, std_value = aux.linear_regression(
            df1_f['RT pred'], df1_f['RT exp'])
        logger.info('RT model training results: R^2 = %f , std = %f',
                    r_value**2, std_value)
        df1['RT diff'] = df1['RT pred'] - df1['RT exp']
        logger.info('Retention model calibrated successfully.')
    except Exception:
        # Best-effort fallback: predict with a fixed coefficient set and
        # copy the experimental value into 'RT diff'.
        logger.warning('Retention times are probably missing in input file.')
        df1['RT pred'] = df1['peptide'].apply(
            lambda x: calc_RT(x, achrom.RCs_krokhin_100A_tfa))
        df1['RT diff'] = df1['RT exp']
    return df1, decoy2set