def extract_deamids(mod_str):
    """Parse the deamidation entries of a comma-separated modification string.

    The string is split on ',' and every piece is stripped of surrounding
    whitespace.  Pieces matching the module-level ``deamid`` pattern are
    handed to ``ms.parse_spectrum_modifications``; all other pieces are
    ignored.

    Parameters
    ----------
    mod_str : str
        Comma-separated modification descriptions.

    Returns
    -------
    pandas.Series
        One element per matching piece, in order of appearance, holding
        whatever ``ms.parse_spectrum_modifications`` returned for it.
    """
    parsed = []
    for raw_piece in mod_str.split(','):
        piece = raw_piece.strip()
        # only the pieces recognised by the `deamid` pattern are parsed ...
        if deamid.search(piece):
            parsed.append(ms.parse_spectrum_modifications(piece))
    return pd.Series(parsed)
# For each (pept, peptide_start, prot_seqrec) row of pep_df: collect the unique modification strings reported for that peptide in spec_info, parse those matching `deamid`, and for every n/N entry whose value is within 0.1 of 3 record the 1-based protein start and the 3-residue protein slice beginning there ...
# enumerating all tractable peptides from pep_df ... glyco_mod = [] for uniq_pept, pept_pos, prot_sr in pep_df[['pept','peptide_start','prot_seqrec']].itertuples(index=False): prot_seq = str(prot_sr) pept_spectrum = spec_info[ spec_info['pept']==uniq_pept ] # let's check all present modifications in the flattened-out list of lists ... modifs = [ mod for cmod in pept_spectrum['Variable modifications identified by spectrum'] for mod in cmod.strip().split(',') ] # and get those that are unique ... modifs = np.unique(modifs) # now extract type,position and value for each of them ... modifs = [ ms.parse_spectrum_modifications(mod) for mod in modifs if bool(deamid.search(mod)) ] # now extracting meaningful glycosylation sites ... glyco_sites = [] glyco_start = [] # looks like the inner loop here is the only place where we do need 0-based indexing switching ... for type_aa,gpos_pept,value in modifs: if (type_aa in ['n','N']) and (np.abs(value-3)<0.1): # 'pept_pos' - is 1-based absolute position of the peptide in the protein ... # 'gpos_pept' - is 1-based relative position of gsite_start_N in the peptide ... gsite_start = pept_pos + gpos_pept-1 # 1-based coordinate ... gsite_stop = pept_pos + gpos_pept-1 + 3-1 # 1-based coordinate ... glyco_start.append(gsite_start) # Due to slicing rules, we need [start-1:stop], to have position 'stop' included ... glyco_sites.append(prot_seq[gsite_start-1:gsite_stop]) ############################################################ # gstart must be 1-based for output ...