def expmat_construction(exp_file, exp_paramlist, charge_list):
    """Build the smoothed, rescaled experimental m/z-by-time matrix.

    Loads the pickled scan table from ``exp_file``, keeps the scans whose
    ``starttime`` lies in ``[gradient_starttime, gradient_endtime)``, converts
    times and m/z values to bin indices with ``time_index`` / ``mz_index``,
    sums intensities falling into the same m/z bin of each scan, and assembles
    the result into a ``(MZ_SCALE, TIME_SCALE)`` sparse matrix that is then
    passed through ``smoothingtime_mat`` and ``rescale_mat``.

    Args:
        exp_file: Path handed to ``pd.read_pickle``. The table must provide
            the columns 'ind', 'mslev', 'bpmz', 'bpint', 'starttime',
            'mzarray' and 'intarray'.
        exp_paramlist: Indexable whose entry ``1`` unpacks into the twelve
            values (mm, mrange_min, mrange_max, mz_range, MZ_SCALE, tt,
            gradient_starttime, gradient_endtime, gradient_time, TIME_SCALE,
            window, shift).
        charge_list: Unused; kept for interface compatibility.

    Returns:
        Tuple ``(exp_mat, exp_paramlist, mat_mean)`` where ``exp_mat`` and
        ``mat_mean`` are whatever ``rescale_mat`` returns and
        ``exp_paramlist`` is passed through unchanged.
    """
    mslev = 1
    # Only the entry at index `mslev` is ever read, so unpack it once.
    # mz_range and gradient_time are part of the tuple layout but unused here.
    (mm, mrange_min, mrange_max, mz_range, MZ_SCALE,
     tt, gradient_starttime, gradient_endtime, gradient_time, TIME_SCALE,
     window, shift) = exp_paramlist[mslev]

    # Single pre-formatted argument so the output is identical whether print
    # is the Python 2 statement or the Python 3 function.
    print(' from: %s  using mass spectrogram from %s to %s minutes'
          % (exp_file, gradient_starttime, gradient_endtime))

    # NOTE(security): unpickling can execute arbitrary code; only load
    # files from a trusted source.
    exp_df = pd.read_pickle(exp_file)

    # transform the header columns from str to numeric
    for column in ('ind', 'mslev', 'bpmz', 'bpint', 'starttime'):
        exp_df[column] = pd.to_numeric(exp_df[column])

    # drop out-of-range times; copy so the column assignments below write to
    # an independent frame rather than a slice of the loaded table
    in_gradient = ((exp_df['starttime'] >= gradient_starttime)
                   & (exp_df['starttime'] < gradient_endtime))
    exp_df = exp_df[in_gradient].copy()

    # prepend the base peak to its array so each scan has one combined list
    for bp, ar, combine in zip(['bpmz', 'bpint'],
                               ['mzarray', 'intarray'],
                               ['allmz', 'allint']):
        exp_df[combine] = exp_df[bp].apply(lambda x: [x]) + exp_df[ar]

    # create bin indices
    exp_df['starttime'] = time_index(exp_df['starttime'], gradient_starttime,
                                     tt)
    exp_df['allmz'] = mz_index(exp_df['allmz'].values, mrange_min, mrange_max,
                               mm)
    exp_df = exp_df[['ind', 'starttime', 'allmz', 'allint']]

    time_col = []
    for mz_bins, intensities in zip(exp_df['allmz'], exp_df['allint']):
        # remove peaks whose m/z bin is outside [0, MZ_SCALE)
        kept = [(m, i) for m, i in zip(mz_bins, intensities)
                if 0 <= m < MZ_SCALE]
        kept_mz = [m for m, _ in kept]
        kept_int = [i for _, i in kept]
        # bincount sums the intensities sharing an m/z bin and pads the
        # result to MZ_SCALE entries, i.e. one dense column per scan
        timecol_array = np.bincount(kept_mz, kept_int, minlength=MZ_SCALE)
        # summed intensities below 1 are zeroed
        timecol_array[timecol_array < 1] = 0
        time_col.append(timecol_array)

    # Every scan contributes MZ_SCALE consecutive values: rows cycle through
    # the m/z bins, columns repeat the scan's time bin.
    expdf_row = np.tile(np.arange(MZ_SCALE), exp_df.shape[0])
    expdf_col = np.repeat(exp_df['starttime'].values, MZ_SCALE)
    expdf_value = np.concatenate(time_col)
    exp_mat = sparse.coo_matrix((expdf_value, (expdf_row, expdf_col)),
                                shape=(MZ_SCALE, TIME_SCALE))

    exp_mat = smoothingtime_mat(exp_mat, window, shift)
    exp_mat, mat_mean = rescale_mat(exp_mat)
    return exp_mat, exp_paramlist, mat_mean
def refMS1_construction(refms1_df, M_header, iso_header, charge_list,
                        iso_maxnumber, globalparam_list, eps):
    """Build the L1-normalised sparse MS1 reference matrix.

    The peptide table is melted so that every row is one peptide at one
    charge state, the m/z of each isotope peak is converted to a bin index
    with ``mz_index``, and the isotope abundances are placed into a
    ``(MZ_SCALE, n_ions)`` sparse matrix whose columns are then L1-normalised
    with ``normalize``.

    Args:
        refms1_df: Peptide table with the columns 'prot', 'pept', 'mod',
            'modpept', 'rtpeak', every name in ``iso_header`` and every name
            in ``M_header``. Must already be sorted by 'prot'.
        M_header: List of per-charge m/z column names. The charge is read
            from the last character of each name, so only single-digit
            charges can be represented.
        iso_header: List of isotope-abundance column names; position ``k``
            is treated as the k-th isotope peak.
        charge_list: Unused; kept for interface compatibility.
        iso_maxnumber: Number of isotope peaks per ion. Values above 1 use
            all of ``iso_header`` (presumably ``len(iso_header)`` must equal
            ``iso_maxnumber`` -- TODO confirm); otherwise only the hard-coded
            'isoab0' column is used.
        globalparam_list: ``globalparam_list[0]`` holds parameter names and
            ``globalparam_list[1]`` the matching values; 'mm', 'mrange_min',
            'mrange_max' and 'MZ_SCALE' are read.
        eps: Unused; kept for interface compatibility.

    Returns:
        Tuple ``(sreference, mziso_df, prot_peptcount, pept_ioncount)``: the
        normalised sparse matrix, the melted per-ion table, and the per-'prot'
        and per-'pept' row counts (frames with an 'ms1count' column).

    Raises:
        SystemExit: With status 1 if 'prot' is not sorted or the melted
            table's index is not 0..n-1.
    """
    mslev = 1
    param_names = globalparam_list[0]
    param_values = globalparam_list[mslev]
    mm = param_values[param_names.index('mm')]
    mrange_min = param_values[param_names.index('mrange_min')]
    mrange_max = param_values[param_names.index('mrange_max')]
    MZ_SCALE = param_values[param_names.index('MZ_SCALE')]

    # A single out-of-place entry means the table is unsorted, so test with
    # any(); all() would only fire when every position differs.
    sorted_prot = refms1_df.sort_values('prot').prot.values
    if np.any(refms1_df.prot.values != sorted_prot):
        print('Warning: Prot is not alphabetically sorted')
        raise SystemExit(1)

    mziso_df = refms1_df.rename_axis('pept_id')
    print(mziso_df.columns)

    # melt M_header (all charges) into a 'variable' column and its m/z into a
    # 'value' column, so every row is one peptide at one charge state that
    # still carries its isotope abundances
    id_columns = ['prot', 'pept_id', 'pept', 'mod', 'modpept',
                  'rtpeak'] + iso_header
    mziso_df = mziso_df.reset_index().melt(id_columns, M_header)
    mziso_df = mziso_df.rename(columns={'value': 'mz', 'variable': 'charge'})
    mziso_df['charge'] = mziso_df['charge'].str[-1].astype(int)
    # an empty modification is valid and must survive the dropna below
    mziso_df['mod'] = mziso_df['mod'].fillna('')

    # drop NA, order ions of the same pept_id by ascending charge, and
    # renumber rows 0..n-1 because the index is used as the matrix column
    mziso_df = (mziso_df.dropna()
                .sort_values(['pept_id', 'charge'])
                .reset_index(drop=True))

    prot_peptcount = mziso_df['prot'].value_counts().sort_index().to_frame(
        name='ms1count')
    peptcount = prot_peptcount.values.sum()
    pept_ioncount = mziso_df['pept'].value_counts().sort_index().to_frame(
        name='ms1count')

    mzidx_header = []
    charges = mziso_df['charge'].values
    for idx, iso_head in enumerate(iso_header):
        # float() forces true division; under Python 2 an int / int-array
        # division floors and would collapse the isotope offset to 0
        mziso_df['mz_' + iso_head] = (mziso_df['mz'].values
                                      + float(idx) / charges)
        mziso_df['mzidx_' + iso_head] = mz_index(
            mziso_df['mz_' + iso_head].values, mrange_min, mrange_max, mm)
        mzidx_header.append('mzidx_' + iso_head)

    mziso_df['final_mzidx'] = mziso_df[mzidx_header].values.tolist()
    mziso_df['final_normisoab'] = mziso_df[iso_header].values.tolist()

    # sanity check: the index doubles as the column coordinate below
    if not np.array_equal(mziso_df.index.values, np.arange(len(mziso_df))):
        print('Warning mziso_df reindex is wrong')
        print(mziso_df.index.values)
        raise SystemExit(1)

    if iso_maxnumber > 1:
        mziso_df_col = np.repeat(mziso_df.index.values, iso_maxnumber)
        mziso_df_row = np.concatenate(mziso_df['final_mzidx'].values)
        mziso_df_value = np.concatenate(mziso_df['final_normisoab'].values)
    else:
        # no isotopes: only the monoisotopic peak, column names hard-coded
        mziso_df_col = np.array(mziso_df.index.values)
        mziso_df_row = np.array(mziso_df['mzidx_isoab0'].values)
        mziso_df_value = np.array(mziso_df['isoab0'].values)

    # keep entries with a bin inside [0, MZ_SCALE) and a positive abundance;
    # the upper bound mirrors the filter in expmat_construction and keeps
    # coo_matrix from rejecting a row index beyond the declared shape
    keep = ((mziso_df_row >= 0) & (mziso_df_row < MZ_SCALE)
            & (mziso_df_value > 0))
    sreference = sparse.coo_matrix(
        (mziso_df_value[keep], (mziso_df_row[keep], mziso_df_col[keep])),
        shape=(MZ_SCALE, peptcount))
    sreference = normalize(sreference, norm='l1', axis=0)
    return sreference, mziso_df, prot_peptcount, pept_ioncount