# stage3_gsites_catalog.py
# (web-scrape residue removed here: a duplicated filename banner and a bare
#  line-number gutter that carried no content)
import os
import sys
from Bio import Seq
from Bio import SeqIO
from Bio import SeqRecord
import pandas as pd
import numpy as np
import ms_module as ms
import re
############################
from StringIO import StringIO
#
import argparse
#
#
# HOW TO LAUNCH THIS THING ...
# %run stage3_gsites_catalog.py --prefix ../raw_data/New_files_to_analyze/011216\ glycocapture\ 90-90 -m pept_prot_map.csv -g pulled_proteins.gb -s specs.xls
#
# do some arguments parsing to make the script look civilized ...
# (typos in user-visible help strings fixed: "speicfy" -> "specify")
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--exp_num",
                    help="specify number of an experiment", required=True)
parser.add_argument("-m", "--pept_prot_map",
                    help="specify file name of peptide summary with unique fetchids of matching proteins (with/without path)", required=True)
parser.add_argument("-g", "--genbank",
                    help="specify file name of genbank records with pulled proteins (with/without path)", required=True)
parser.add_argument("-s", "--spec_summary",
                    help="specify spectrum file name (with/without path)", required=True)
parser.add_argument("--prefix",
                    help="specify common part of the path for peptide and spectrum files")
parser.add_argument("--separator",
                    help="specify separator type in the input data", default='tab')
args = parser.parse_args()
#
###############################################
# Resolve input file locations: when --prefix is given, every input name is
# taken relative to that common directory; otherwise names are used verbatim.
if args.prefix is None:
    pep_map_fname = args.pept_prot_map
    spec_fname = args.spec_summary
    gb_fname = args.genbank
else:
    pep_map_fname = os.path.join(args.prefix, args.pept_prot_map)
    spec_fname = os.path.join(args.prefix, args.spec_summary)
    gb_fname = os.path.join(args.prefix, args.genbank)
# directory shared by all three inputs -- the final CSV is written there later ...
common_path = os.path.dirname(os.path.commonprefix([pep_map_fname, spec_fname, gb_fname]))
#
# Reading genbank mindfully next ...
# NOTE(review): ms.genebank_fix_n_read presumably returns a dict-like mapping
# of record ids to Bio.SeqRecord objects (key_func_type='id'); the records are
# indexed by accession below -- confirm against ms_module.
gbrecs = ms.genebank_fix_n_read(gb_fname,key_func_type='id')
######################################
# assign some module internal stuff ...
# make the loaded records visible inside ms_module as a module-level attribute.
ms.gbrecs = gbrecs
#
# separator type choice is needed only for the ORIGINAL input files ...
# 'comma' selects ','; anything else (including the default 'tab') selects '\t'.
separator = ',' if args.separator == 'comma' else '\t'
#################
# Load the two upstream tables: the peptide->protein map (comma-separated)
# and the spectrum summary (separator chosen above).
pep_info = pd.read_csv(pep_map_fname)
spec_info = pd.read_csv(spec_fname,sep=separator)
# fix their peptide sequence thing right away ...
# (upper-case normalization so 'pept' can be used as a join key across tables)
spec_info['pept'] = spec_info['Peptide sequence'].str.upper()
pep_info['fetchid'] = pep_info['fetchid'].apply(int)
# this is an UGLY fix that we'd have to implement here just to save everything ...
# NOTE(review): experiment 1 appears to lack an 'enzyme' column, so trypsin
# ('T') is hard-coded for it -- confirm against the stage-2 inputs.
if args.exp_num=='1':
    pep_info['enzyme'] = 'T'
#
# fasta = SeqIO.to_dict(SeqIO.parse(fasta_fname,"fasta"),key_function=lambda _: _.id.split('|')[1])
# 1-BASED NOTATION FOR PROTEINS INDEXING ENFORCED ...
# pep_df = pd.read_csv(uniq_pept_fname)
# connection between peptide info and spectrum info to be established ...
##########################################################################
# unroll that spec table to have 1 deamid per row ...
spec_info_unrolled = ms.unroll_by_mfunc(spec_info,'Variable modifications identified by spectrum',ms.extract_deamids,'deamid_info')

def percent_to_float(column):
    """Return a probability column as floats.

    The exports carry probabilities either as floats already, or as strings
    like '99.9%'; in the latter case strip the percent sign and convert.
    """
    if column.dtype != 'float':
        return column.str.strip('%').apply(float)
    return column

####################################################################################################################
# identical '%'-string-vs-float handling for both probability columns
# (previously duplicated inline for each column) ...
spec_info_unrolled['prot_ident_probab'] = percent_to_float(spec_info_unrolled['Protein identification probability'])
spec_info_unrolled['pept_ident_probab'] = percent_to_float(spec_info_unrolled['Peptide identification probability'])
####################################################################################################################
# so far the following merge seems to be 100% sufficient for the desired final output ...
# we could add on extra features if needed ...
# how='right' keeps every row of pep_info, attaching spectrum/deamidation
# columns where a peptide sequence matches.
spec_n_pep = spec_info_unrolled[['pept',
                                 'deamid_info',
                                 'prot_ident_probab',
                                 'pept_ident_probab']].merge(pep_info,how='right',on='pept',suffixes=('','_x'))
# Now, extract those gsites ...
# NOTE(review): ms.deamid_to_gsite presumably returns a mapping with keys such
# as 'gsite_seq' and 'gstart' (both are consumed further below) -- confirm in ms_module.
dg_func = lambda x: pd.Series( ms.deamid_to_gsite(x['deamid_info'], x['start_fetched'], str(gbrecs[x['fetchacc']].seq)) )
# and add them back to the main table ...
# row-wise apply produces a frame sharing spec_n_pep's index, so an
# index-on-index merge glues the new gsite columns back on.
gs_res = spec_n_pep[['deamid_info','start_fetched','fetchacc']].apply( dg_func, axis=1 )
spec_n_pep = spec_n_pep.merge(gs_res,left_index=True,right_index=True)
print
print "Now we'd need to add theoretical glycosilation sites as a separate column ..."
print "full protein sequence and its length is added as well ..."
# this analysis must be done, once for each 'fetchid', and then merged back to the main table ...
get_theor_sites_fid = lambda facc: ms.get_theor_sites(str(gbrecs[str(facc)].seq))
get_theor_sites_number_fid = lambda facc: ms.get_theor_sites_number(str(gbrecs[str(facc)].seq))
theor_sites_info = lambda facc: pd.Series(
{'fetchacc':facc,
'gsites_predicted':get_theor_sites_fid(facc),
'gsites_predicted_number':get_theor_sites_number_fid(facc),
'prot_seq':str(gbrecs[str(facc)].seq),
'prot_len':len(str(gbrecs[str(facc)].seq))} )
###################################################
predicted_gsite_info = spec_n_pep['fetchacc'].drop_duplicates().apply(theor_sites_info)
# add back to the main table ...
spec_n_pep = spec_n_pep.merge(predicted_gsite_info,on='fetchacc',how='right')
print "done ..."
print "numbering appears to be 1-based and overall correct!"
print
# print " 'gsites_predicted' column uses 1-based numbering. Enforced and checked."
# SOME FINAL PREPARATIONS TO COMPLY WITH THE REQUESTED FORMAT ...
# extract gsite AAs as separate columns: gsite_AA1..gsite_AA3 take the
# characters at positions 0..2 of the gsite sequence ...
for _aa_pos in (1, 2, 3):
    spec_n_pep['gsite_AA%d' % _aa_pos] = spec_n_pep['gsite_seq'].str.get(_aa_pos - 1)
# locus protein_name uid Protein_ID_PERCENT peptides best_peptide Peptide_probability protease Expt_NUMBER prev_aa next_aa pept_start pept_stop Location match g_site gsite_start gsites_AA1_N gsites_AA2_XbutP gsites_AA3_TS Best Mascot Ion score Best Mascot Identity score Best Mascot Delta Ion score Prot_seq signal signal_loc tm_span protein length
# NOTE(review): a first draft of requested_cols was assigned here and then
# immediately overwritten by the list below; the dead assignment was removed.
# columns to carry into the intermediate (pre-collapse) table ...
requested_cols = ['locus',
                  'prot_name',
                  'uid_fetched',
                  'prot_ident_probab',
                  # 'peptides', # THIS IS NEW STUFF ...
                  'pept',
                  'fetchid',
                  'fetchacc',
                  # 'best_pept', # THIS IS NEW STUFF ...
                  'pept_ident_probab', # BEWARE, pept ID % of the BEST PEPTIDE ...
                  'enzyme',
                  # 'experiment_num', # THIS IS NEW STUFF ...
                  ###########################
                  'prev_aa',
                  'next_aa',
                  'prev_aa_fetched',
                  'next_aa_fetched',
                  'pept_start',
                  'pept_stop',
                  'start_fetched',
                  'stop_fetched',
                  ###########################
                  'gsite_seq',
                  'gstart',
                  'gsite_AA1',
                  'gsite_AA2',
                  'gsite_AA3',
                  'Best Mascot Ion Score',
                  'Best Mascot Identity Score',
                  'Best Mascot Delta Ion Score',
                  'prot_seq', # PROTEIN SEQUENCE TO BE ADDED ...
                  'signal',
                  'signal_loc',
                  'tm_span',
                  'prot_len', # PROTEIN LENGTH TO BE ADDED ...
                  'SCORE',
                  'crit_pept_in']
# ###################################################
# # TO BE CONTINUED ...
# keep only the requested columns; drop exact duplicate rows ...
THE_MOST_FINAL_DF = spec_n_pep[requested_cols].drop_duplicates().reset_index(drop=True)
# choose peptide with highest Pept_ident_probab
# Let's collapse (gsite,pept,fetchid) using the highest pept_ident_probab:
# per group, idxmax() yields the label of the row with the top probability,
# and .loc materializes exactly those rows.
THE_MOST_FINAL_DF_max_prob = THE_MOST_FINAL_DF.loc[THE_MOST_FINAL_DF.groupby(['gsite_seq','gstart','pept','fetchid','fetchacc','enzyme'],sort=False)['pept_ident_probab'].idxmax() ].reset_index(drop=True)
##################################################
# TO BE CONTINUED ....
#################################################
# BEST PEPTIDE CHOSING IS WORKING, FINISH ALL PEPTIDES AND OTHER STUFF ...
def choose_best_pept(df, dp_margin=1.0):
    """Return the index label of the 'best' peptide row in *df* (Reid's rule).

    Prefer the SHORTEST peptide -- unless its identification probability
    trails the most-probable peptide's by more than *dp_margin* percent
    points, in which case fall back to the most probable one.

    df        : DataFrame with 'pept' and 'pept_ident_probab' columns
    dp_margin : allowed probability gap (percent points) for the shortest peptide
    """
    # pandas 0.18.1 broke something here: "TypeError: Series.name must be a hashable type" ...
    # we're fixing it, introducing one more layer of control here ...
    assert 'pept' in df.columns
    assert 'pept_ident_probab' in df.columns
    # this should work on any df with the 2 specified columns ...
    if df.shape[0] > 1:
        shortest_idx = df['pept'].str.len().idxmin()
        max_prob_idx = df['pept_ident_probab'].idxmax()
        # scalar label-based access via .at instead of chained indexing ...
        if df.at[shortest_idx, 'pept_ident_probab'] < (df.at[max_prob_idx, 'pept_ident_probab'] - dp_margin):
            return max_prob_idx
        return shortest_idx
    # single-row group: idxmax simply returns that row's label ...
    return df['pept_ident_probab'].idxmax()
def unstack_pept(series):
    """Collapse a group of peptide strings into one comma-separated string."""
    sep = ','
    return sep.join(series)
# fff=THE_MOST_FINAL_DF_max_prob.groupby(['gsite_seq','gstart','locus','enzyme'],sort=False).size()
# fff[fff>1]
# NOTE(review): a groupby(...)['pept'].apply(unstack_pept) expression was
# evaluated here with its result discarded -- a leftover debugging probe
# (the same computation is performed below when the 'peptides' column is
# built). It was removed to avoid a wasted pass over the table; behavior
# is unchanged since unstack_pept has no side effects.
# choosing BEST PEPTIDE containing given gsite
# according to the Reid's rule of best peptide (implemented in choose_best_pept) ...
THE_MOST_FINAL_DF_uniq_pept = \
    THE_MOST_FINAL_DF_max_prob.loc[
        THE_MOST_FINAL_DF_max_prob.groupby(['gsite_seq',
                                            'gstart',
                                            'locus',
                                            'enzyme'],sort=False).apply(choose_best_pept)
    ].reset_index(drop=True)
# given the best peptides are chosen, unstacking multiple peptides corresponding to the same gsite
# and forming a column 'peptides' ...
# NOTE(review): this positional assignment relies on the groupby below yielding
# groups in the same order as the one used to pick the best peptides (both use
# sort=False on the same keys) -- verify if pandas is ever upgraded.
THE_MOST_FINAL_DF_uniq_pept['peptides'] = \
    THE_MOST_FINAL_DF_max_prob.groupby(['gsite_seq',
                                        'gstart',
                                        'locus',
                                        'enzyme'],sort=False)['pept'].apply(unstack_pept).reset_index(drop=True)
# rename pept to best_pept AND enzyme to protease ...
THE_MOST_FINAL_DF_uniq_pept = THE_MOST_FINAL_DF_uniq_pept.rename(columns={'pept':'best_pept','enzyme':'protease',})
# add experiment number, something new ...
THE_MOST_FINAL_DF_uniq_pept['exp_num'] = int(args.exp_num)
# location match instead of fetched/Scaffold-based results:
# 'Y' when the reported peptide start agrees with the position found in the fetched sequence.
THE_MOST_FINAL_DF_uniq_pept['loc_match'] = THE_MOST_FINAL_DF_uniq_pept['pept_start'] == THE_MOST_FINAL_DF_uniq_pept['start_fetched']
THE_MOST_FINAL_DF_uniq_pept['loc_match'] = THE_MOST_FINAL_DF_uniq_pept['loc_match'].map({True:'Y',False:'N'})
# FINAL COLUMN SELECTION/ORDER FOR THE OUTPUT CSV (one row per gsite/best peptide) ...
requested_cols = ['locus',
                  'prot_name',
                  'uid_fetched',
                  'prot_ident_probab',
                  'peptides', # THIS IS NEW STUFF ...
                  # 'pept',
                  # 'fetchid',
                  'fetchacc',
                  'best_pept', # THIS IS NEW STUFF ...
                  'pept_ident_probab', # BEWARE, pept ID % of the BEST PEPTIDE ...
                  'protease',
                  'exp_num', # THIS IS NEW STUFF ...
                  ###########################
                  'prev_aa',
                  'next_aa',
                  # 'prev_aa_fetched',
                  # 'next_aa_fetched',
                  'pept_start',
                  'pept_stop',
                  'loc_match',
                  # 'start_fetched',
                  # 'stop_fetched',
                  ###########################
                  'gsite_seq',
                  'gstart',
                  'gsite_AA1',
                  'gsite_AA2',
                  'gsite_AA3',
                  'Best Mascot Ion Score',
                  'Best Mascot Identity Score',
                  'Best Mascot Delta Ion Score',
                  'prot_seq', # PROTEIN SEQUENCE TO BE ADDED ...
                  'signal',
                  'signal_loc',
                  'tm_span',
                  'prot_len', # PROTEIN LENGTH TO BE ADDED ...
                  'SCORE',
                  'crit_pept_in']
# write the final gsite catalog next to the inputs (common_path computed above) ...
THE_MOST_FINAL_DF_uniq_pept[requested_cols].to_csv(os.path.join(common_path,'FINAL_gsite_anthology.csv'),index=False)
# THE_MOST_FINAL_DF_uniq.to_csv(os.path.join(common_path,'FINAL_gsite_anthology.csv'),index=False)
# # DESIREd COLUMNS ...
# # ############################################
# # # columns that needs to be delivered ... #
# # ############################################
# # # A gsites, 1 per line
# # # B pept, 1 per line
# # # B1 enzyme, G or T, derive from 'Biological sample category', like this: {'TrypsinSample1':'T','GluC_Sample2':'G'}
# # # C peptide_start, 1 per line accordingly
# # # D all_uids, REPLACE WITH col:H
# # # E prot_seq, try to get those from NCBI, not from UniProt ...
# # # F protein, ??? sequence, name or what???
# # # G uid_max, UID for major form instead or something like that ...
# # # H prot_name, parsed out human-readable name from 'Protein name'
# # # H1 gene_name, parsed out GN=xxx from 'Protein name'
# # # I uniq_peptide_count, discard that column ...
# # # J pept_probability, output number not the string - this would be the criteria
# # # K gsites_predicted, OK
# # # L gsites_predicted_number, OK
# # # M gsite_start, beware of 0 or 1 type of indexing ...
# # # N,O,P - gsites AAs in separate columns
# # # M1, NOP combined, gsite sequence basically!
# # # Q signal, from GeneBank record on the protein, simply Y,N on whether there is a 'Signal' in gb.
# # # R signal_location, location of the signal from Q
# # # S tm_span, Y,N just for the fact of having TM span as a protein feature.
# # THINGS TO BE DONE:
# # 1) MERGE PEP_INFO WITH SPEC_INFO, SO THAT PEPT-PROT RELATIONSHIP WOULD BE SET UP ...
# # probably getting rid of many, many columns in the spec_info along the way ...
# # 2) MODIFY AND TEST THE 'deamid_to_gsite' FUNCTION (PAY ATTENTION TO 1-BASED AND 0-BASED NUMBERING OF AAs)
# # 3) USE 'deamid_to_gsite' TO FINALLY EXTRACT GSITES ...
# # 4) ANALYSE(?) GSITES: GROUPBY UNIQUE GSITES (GSITE[seq],GSITE_START,PROT_IDENTIFICATOR) TO SEE HOW MANY PEPTS YIELD THE SAME GSITES ...
# # 5) SELECT A SINGLE PEPT PER GSITE??????? USING REID'S CRITERIA, OR LEAVE ALL GSITE-PEPT PAIRS ???????????
# #######################################
# # 6) COMPLY WITH THE #columns that needs to be delivered...# FOR THE VERY VERY FINAL OUTPUT ...
# #######################################