/
score.py
executable file
·343 lines (309 loc) · 13.4 KB
/
score.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
from __future__ import division
import sys
import itertools
import numpy as np
from scipy import spatial
import os
from scipy import sparse
from collections import defaultdict
import operator
import utils as ut
import elution as el
import orth
def score_array_multi(arr, sp_base, elut_fs, scores, cutoff, verbose=False,
                      remove_multi_base=False, gidscheme=None,
                      allow_singles=True):
    """
    Fill `arr` with every requested score for every elution file.

    Each file in `elut_fs` is loaded in turn, base-species ids are mapped onto
    its rows with orth_indices(), and score_array() is run once per entry of
    `scores`.  Nothing is returned; score_array() writes into `arr`.

    - arr: handed to score_array() unchanged.
    - sp_base: base species code, handed to orth_indices().
    - elut_fs: iterable of elution filenames.  The first two characters of
      ut.shortname(f) are used as the target species code.
    - scores: iterable of score keys, handed to score_array() one at a time.
    - cutoff: forwarded unchanged to score_array().
    - verbose: print each (score, file) combination before scoring it.
    - remove_multi_base: This is not the method currently used to filter
      scores in cases of orthogroup fan-outs--this is a stricter earlier
      version. That filter is feature.py: filter_multi_orths(), applied after
      scoring.
    - gidscheme: not implemented in scoring; must be None or ''.
    - allow_singles: when False, the proteins returned by prots_singles() are
      excluded from scoring.

    Raises AssertionError if a non-empty gidscheme is supplied.
    """
    # The default value (None) used to fail this guard, so a call relying on
    # the defaults could never succeed.  Both "unset" spellings are accepted.
    # Raised explicitly so the check survives `python -O`.
    if gidscheme not in (None, ''):
        raise AssertionError("Gidscheme not implemented in scoring.")
    if remove_multi_base:
        print("Filtering orths: only single base gene in orthogroups.")
    current_sp = ''
    for fname in elut_fs:
        # Load one elution at a time rather than holding all of them at once.
        elut = el.load_elution(fname)
        sp_target = ut.shortname(fname)[:2]
        if sp_target != current_sp:  # Just for status output
            print("Starting first %s file: %s"
                  % (sp_target, ut.shortname(fname)))
            current_sp = sp_target
        baseid2inds = orth_indices(sp_base, sp_target, elut.prots,
                                   remove_multi_base)
        # singles based on original spec counts
        singles = set() if allow_singles else prots_singles(elut)
        # Lets score_array() rebuild the id->index map if the score swaps in
        # a different elution file; sp_target is bound per file.
        recalc = (lambda prots, sp_target=sp_target:
                  orth_indices(sp_base, sp_target, prots, remove_multi_base))
        for score in scores:
            if verbose:
                print("%s %s" % (score, fname))
            score_array(arr, elut, fname, score, cutoff, baseid2inds, singles,
                        recalc)
def orth_indices(sp_base, sp_target, prot_list, remove_multi_base):
    """
    Map base-species gene ids to sets of positions in `prot_list`.

    `prot_list` holds target-species gene ids (the rows of the target species
    score matrix), so the returned positions double as row/column indices of
    the square interaction score matrix.

    When base and target species are the same, each id in the list maps
    straight to a one-element set holding its own position.  Otherwise the
    orthology from orth.odict() is composed with that mapping, optionally
    after remove_multi_keys(), and base ids left with no index are dropped.
    """
    positions = ut.list_inv_to_dict(prot_list)
    targ2inds = {gene: {pos} for gene, pos in positions.items()}
    if sp_base == sp_target:
        return targ2inds
    base2targ = orth.odict(sp_base, sp_target)
    if remove_multi_base:
        base2targ = remove_multi_keys(base2targ)
    composed = ut.compose_dict_sets(base2targ, targ2inds)
    # Keep only base ids that actually landed on at least one row.
    return {base: inds for base, inds in composed.items() if len(inds) > 0}
def remove_multi_keys(d, max_keys=1):
    """
    Return a copy of `d` without keys whose values are over-shared.

    Given a dict of key: set(vs), a key is dropped as soon as any one of its
    vs has more than `max_keys` entries in the inverse mapping built by
    ut.dict_inverse_sets() (presumably v -> set of keys containing v; confirm
    against utils).  `d` itself is left untouched; the copy is shallow, so
    the value sets are shared with `d`.
    """
    newd = d.copy()
    dinv = ut.dict_inverse_sets(newd)
    # Decide first, delete afterwards: removing entries while iterating
    # items() raises RuntimeError on Python 3.
    doomed = [k for k, vs in newd.items()
              if any(len(dinv[v]) > max_keys for v in vs)]
    for k in doomed:
        del newd[k]
    return newd
def score_array(arr, elut, fname, score, cutoff, id2inds, singles_exclude,
                recalc_id2inds):
    """
    Use the target species score matrix to get interaction pair in the base
    species array. Don't score and just leave as default (0 now) cases where
    either: 1) One of the pair is not in this score matrix, or 2) The two base
    ids in the pair map to identical targets, since in that case we also can
    get no information from this data (see notes 2012.08.12).
    Also exclude any proteins listed in `singles_exclude` (proteins with just
    one total count in this elution).

    - arr: iterable of rows supporting row['id1'], row['id2'] and assignment
      to row[<score column>]; modified in place.
    - elut: elution object; `elut.prots` labels the score matrix rows unless
      the score swaps in another file.
    - fname, score: combined by name_score() into the column that is written.
    - cutoff: accepted for interface compatibility; not used here.
    - id2inds: dict of base id -> collection of row indices.
    - singles_exclude: set of protein labels whose indices are skipped.
    - Recalc_id2inds: purpose is for remapping to the right indices in the
      case of switching out to a new elution file with a differently-ordered
      matrix. This is currently only to handle the ms1 elution data.

    The value stored for a pair is the maximum score over all index
    combinations of the two ids.
    """
    score_mat, new_id2inds, new_prots = scorekey_elution(score, elut,
                                                         recalc_id2inds)
    # Test against None rather than truthiness: an empty recalculated mapping
    # must not fall back to indices that belong to a different matrix, and an
    # array of labels has no unambiguous truth value.
    if new_id2inds is not None:
        id2inds = new_id2inds
    prots = elut.prots if new_prots is None else new_prots
    score_name = name_score(fname, score)
    for row in arr:
        id1, id2 = row['id1'], row['id2']
        if id1 not in id2inds or id2 not in id2inds:
            continue
        inds1, inds2 = id2inds[id1], id2inds[id2]
        if inds1 == inds2:
            # Identical targets carry no information about this pair.
            continue
        if len(singles_exclude) > 0:
            inds1 = remove_labeled(inds1, prots, singles_exclude)
            inds2 = remove_labeled(inds2, prots, singles_exclude)
        if len(inds1) > 0 and len(inds2) > 0:
            # Could also check for i!=j but would have no effect here since
            # these mappings come from disjoint orthogroups.
            row[score_name] = max(score_mat[i, j]
                                  for i in inds1 for j in inds2)
def remove_labeled(ids, labels, set_remove):
    """
    Return, as a list, the members of `ids` whose label is not in
    `set_remove`.  `labels[i]` is the label looked up for index `i`.
    """
    kept = []
    for idx in ids:
        if labels[idx] in set_remove:
            continue
        kept.append(idx)
    return kept
def name_score(fname, score):
    """
    Build the column name for `score` on elution file `fname`: the short
    name of the file and the score key joined by an underscore.
    """
    short = ut.shortname(fname)
    return short + '_' + score
def prots_singles(elut):
    """
    Return the set of proteins whose counts sum to exactly one.

    - elut: object with `mat` (2-D counts, one row per protein) and `prots`
      (row labels in the same order).

    Row sums are flattened to a plain 1-D array first, so the lookup behaves
    the same whether `mat` is an np.matrix or an ndarray, and an elution with
    no such proteins yields an empty set.
    """
    row_sums = np.asarray(elut.mat.sum(axis=1)).ravel()
    singles_inds = np.where(row_sums == 1)[0]
    return set(np.array(elut.prots)[singles_inds])
def scorekey_elution(score, elut, recalc_id2inds):
    """
    Produce the score matrix for score key `score` on elution `elut`.

    Returns (score_mat, new_id2inds, new_prots).  The last two stay None
    unless the score is computed from a companion quantitation file
    ('pq_euc', 'pq_unfilt_euc', 'mq_euc'); then new_prots is that file's
    protein list and new_id2inds is recalc_id2inds(new_prots) when a callback
    was supplied.

    'apex', 'cosine_old' and 'cosine' give lazy score objects, 'euclidean' is
    computed with pdist_score(), and the remaining keys are read from
    precalculated files beside the elution file.  An unrecognized key raises
    TypeError.
    """
    # Suffix replacing the elution file's extension for pepquant-style data.
    quant_suffix = {
        'pq_euc': '_pqmsb_filtmsb.tab',
        'pq_unfilt_euc': '_pqmsb.tab',
        'mq_euc': '.mq_Intensity.tab',
    }
    # Suffix appended to the elution filename for precalculated scores.
    precalc_suffix = {
        'poisson': '.corr_poisson',
        'wcc': '.T.wcc_width1',
        'euc_poisson': '.corr_euclidean',
        'standard': '.standard',  # eg elution/testms1
    }
    new_id2inds = None
    new_prots = None
    if score == 'apex':
        score_mat = ApexScores(elut)
    elif score == 'cosine_old':
        score_mat = CosineLazyScores(elut)
    elif score == 'cosine':
        score_mat = CosineLazyNew(elut)
    elif score == 'euclidean':
        score_mat = pdist_score(elut.mat, norm_rows=True, norm_cols=True,
                                metric=score)
    elif score in quant_suffix:
        # Use pepquant specific elution file.
        stem = os.path.splitext(elut.filename)[0]
        elut = el.load_elution(stem + quant_suffix[score])
        if recalc_id2inds is not None:
            new_id2inds = recalc_id2inds(elut.prots)  # cv framework (arrfeats)
        new_prots = elut.prots
        score_mat = pdist_score(elut.mat, norm_rows=True, norm_cols=True,
                                metric='euclidean')
    else:
        # no score: exception since string and int don't add
        fscore = elut.filename + precalc_suffix.get(score, 0)
        score_mat = precalc_scores(fscore)
    return score_mat, new_id2inds, new_prots
def traver_corr(mat, repeat=1000, norm='columns', verbose=True):
    """
    Average row-by-row correlation over repeated Poisson resampling.

    As described in supplementary information in paper:
    randomly draw from poisson(C=A+1/M) for each cell, where A = the observed
    count and M is the total fractions; normalize each column to sum to 1
    (or each row when norm='rows'; any other value skips normalization);
    then correlate, and average together for `repeat` tries.

    - mat: 2-D counts, rows are proteins and columns are fractions.
    - repeat: number of resampling rounds; must be at least 1.
    - norm: 'columns' or 'rows'.
    - verbose: print the round number before each round.

    Returns the mean of the per-round np.corrcoef results, with NaNs
    replaced by 0.
    """
    # reduce() is a builtin only on Python 2; this import works on both.
    from functools import reduce

    def poisson_corr(mat, iteration_display, norm):
        if verbose:
            print(iteration_display)
        M = mat.shape[1]
        C = np.asarray(mat, dtype=float) + 1.0 / M
        # One array-valued draw instead of a Python loop over every cell.
        poisson_mat = np.matrix(np.random.poisson(C).astype(float))
        if norm == 'columns':
            poisson_mat = np.nan_to_num(poisson_mat / np.sum(poisson_mat, 0))
        elif norm == 'rows':  # seems to make no performance difference 1/25
            poisson_mat = np.nan_to_num(poisson_mat / np.sum(poisson_mat, 1))
        return np.nan_to_num(np.corrcoef(poisson_mat))

    total = reduce(operator.add,
                   (poisson_corr(mat, i, norm=norm) for i in range(repeat)))
    return total / repeat
def pdist_score(mat, metric='euclidean', norm_rows=True,
                norm_cols=True):
    """
    Turn pairwise row distances into a square score matrix.

    `mat` is first passed through ut.normalize_fracs() with the given
    norm_rows/norm_cols flags.  Distances between all row pairs are computed
    with scipy's pdist using `metric`, expanded to square form, NaNs are
    replaced by 0, and the result is returned as 1 - distance.
    """
    normalized = ut.normalize_fracs(mat, norm_rows, norm_cols)
    condensed = spatial.distance.pdist(normalized, metric=metric)
    square = np.nan_to_num(spatial.distance.squareform(condensed))
    return 1 - square
def poisson_repeat(mat, repeat=200, **kwargs):
    """
    Average pdist_score() over repeated Poisson resampling of `mat`.

    As described in supplementary information in paper:
    randomly draw from poisson(C=A+1/M) for each cell, where A = the observed
    count and M is the total fractions; score the resampled matrix with
    pdist_score(); and average together for `repeat` tries.

    - mat: 2-D counts, rows are proteins and columns are fractions.
    - repeat: number of resampling rounds; must be at least 1.
    - kwargs: forwarded to each round; recognized names are metric
      (default 'cosine'), norm_rows, norm_cols and verbose (all default
      True).

    Returns the mean of the per-round score matrices.
    """
    # reduce() is a builtin only on Python 2; this import works on both.
    from functools import reduce

    def poisson_dist(mat, iteration_display, metric='cosine', norm_rows=True,
                     norm_cols=True, verbose=True):
        if verbose:
            print("%s %s" % (iteration_display, metric))
        M = mat.shape[1]
        C = np.asarray(mat, dtype=float) + 1.0 / M
        # One array-valued draw instead of a Python loop over every cell.
        poisson_mat = np.matrix(np.random.poisson(C).astype(float))
        # Score the resampled counts.  The observed matrix used to be scored
        # here, which discarded the draw and made every round identical.
        return pdist_score(poisson_mat, metric=metric,
                           norm_rows=norm_rows, norm_cols=norm_cols)

    total = reduce(operator.add,
                   (poisson_dist(mat, i, **kwargs) for i in range(repeat)))
    return total / repeat
class ApexScores(object):
    """
    Lazy square score "matrix": entry [i, j] is 1 when rows i and j of the
    elution matrix have their maximum in the same column, otherwise 0.
    """

    def __init__(self, elution):
        dense = np.array(elution.mat)
        # Column index of the largest value in each row.
        self.apex_array = dense.argmax(axis=1)
        n_rows = len(self.apex_array)
        self.shape = (n_rows, n_rows)

    def __getitem__(self, index):
        first = self.apex_array[index[0]]
        second = self.apex_array[index[1]]
        return int(first == second)
#def apex_scores_toarray(smat):
#arr = np.zeros(smat.shape)
#for r in range(smat.shape[0]):
#for c in range(smat.shape[1]):
#if smat[r,c]:
#arr[r,c] = smat[r,c]
#return arr
def apex_scores_toarray_fast(smat):
    """
    Expand an apex score object into a dense array of 0s and 1s.

    Entry [r1, r2] is 1 when rows r1 and r2 share the same value in
    `smat.apex_array` and r1 != r2; everything else, including the diagonal,
    is 0.  The array has shape `smat.shape`.
    """
    rows_by_apex = defaultdict(list)
    for row_idx, apex in enumerate(smat.apex_array):
        rows_by_apex[apex].append(row_idx)
    dense = np.zeros(smat.shape)
    for members in rows_by_apex.values():
        for first in members:
            for second in members:
                if first != second:
                    dense[first, second] = 1
    return dense
def precalc_scores(scoref, dtype='f2'):
    """
    Load a precalculated square score matrix stored at `scoref`.

    A compact copy named '<scoref>.<dtype>.pyd' is loaded with ut.loadpy()
    when it exists.  Otherwise the text file is parsed with np.loadtxt()
    using `dtype`, and the compact copy is written if the
    'save_compact_corrs' config entry is true.

    Also zero out the diagonal to more efficiently remove all
    self-interactions up-front.  This is done on both load paths, so the
    first run (text) and later runs (compact) return the same values.
    """
    save_compact = ut.config()['save_compact_corrs']
    compactf = '%s.%s.pyd' % (scoref, dtype)
    if os.path.exists(compactf):
        mat = ut.loadpy(compactf)
    else:
        # Parse with the same dtype that names the compact file, so the
        # cached data always matches its filename.
        mat = np.loadtxt(scoref, dtype=dtype)
        if save_compact:
            print('saving compact %s' % compactf)
            ut.savepy(mat, compactf)
    inds = np.arange(mat.shape[0])  # always square score matrix
    mat[inds, inds] = 0
    return mat
class CosineLazyNew(object):
    """
    Lazy square score "matrix": entry [i, j] is coscore() of rows i and j of
    the elution matrix after el.normalize_fracs().
    """

    def __init__(self, elution):
        # np.asmatrix is the surviving spelling of np.mat, which was removed
        # in NumPy 2.0; both interpret the input as a matrix.
        self.norm_mat = np.asmatrix(el.normalize_fracs(elution.mat))
        # Exposed for parity with the other lazy score classes.
        n_rows = self.norm_mat.shape[0]
        self.shape = (n_rows, n_rows)

    def __getitem__(self, index):
        # Dot product of normed rows
        return coscore(self.norm_mat, index[0], index[1])
def coscore(mat, i, j):
    """
    Return the dot product of rows i and j of `mat` divided by the product
    of the square roots of each row's dot product with itself.  This is the
    similarity itself, not 1 minus it.
    """
    cross = dotrows(mat, i, j)
    len_i = dotrows(mat, i, i) ** .5
    len_j = dotrows(mat, j, j) ** .5
    return cross / (len_i * len_j)
def dotrows(mat, i, j):
    """
    Return the scalar at [0][0] of row i times the transpose of row j.
    Written for np.matrix input, where `*` is a matrix product and the
    result is therefore the dot product of the two rows.
    """
    row_i = mat[i, :]
    row_j = mat[j, :]
    product = row_i * row_j.T
    return np.asarray(product)[0][0]
class CosineLazyScores(object):
    """
    Lazy square score "matrix": entry [i, j] is the dot product of rows i
    and j of the elution matrix after each row is divided by its norm.
    Rows whose division produces NaN (zero norm) become all zeros.
    """

    def __init__(self, elution):
        counts = elution.mat
        row_norms = np.apply_along_axis(np.linalg.norm, 1, counts)
        # Column vector of norms so the division is applied per row.
        norm_column = np.matrix(row_norms).T
        self.mat_rownormed = np.nan_to_num(counts / norm_column)
        assert type(self.mat_rownormed) is np.matrix
        n_rows = counts.shape[0]
        self.shape = (n_rows, n_rows)

    def __getitem__(self, index):
        # Dot product of normed rows
        first = self.mat_rownormed[index[0], :]
        second = self.mat_rownormed[index[1], :]
        return float((first * second.T)[0, 0])
def matching_pairs(values, ids):
    """
    Return a list of all id pairs whose entries in `values` are equal.

    `ids[k]` is the id belonging to `values[k]`.  Pairs come from
    combinations within each group of equal values, so an index is never
    paired with itself and each pair is listed once.
    """
    ids_by_value = defaultdict(list)
    for pos, val in enumerate(values):
        ids_by_value[val].append(ids[pos])
    pairs = []
    for members in ids_by_value.values():
        pairs.extend(itertools.combinations(members, 2))
    return pairs
def pairs_exceeding(elut, skey, thresh):
    """
    Return protein pairs whose score under `skey` exceeds `thresh`.

    Doesn't return self-self interactions.

    - elut: elution object; `elut.prots` labels the score matrix rows unless
      the score key swaps in another file.
    - skey: score key.  'apex' pairs up proteins with equal apex values
      (`thresh` is not consulted); any other key is resolved by
      scorekey_elution() and compared element-wise against `thresh`.
    - thresh: exclusive lower bound on the score.

    In the non-apex case the whole square matrix is scanned, so with a
    symmetric score matrix each pair appears in both orders.
    """
    arr_prots = np.array(elut.prots)
    if skey == 'apex':
        apexes = ApexScores(elut).apex_array
        return matching_pairs(apexes, arr_prots)
    # loading precomputed indices is so far massively slower than this
    score_mat, _, new_prots = scorekey_elution(skey, elut, None)
    if new_prots is not None:
        arr_prots = np.array(new_prots)
    rows, cols = np.where(np.asarray(score_mat > thresh))
    # Not every score matrix arrives with a zeroed diagonal (1 - distance is
    # 1 there), so self pairs are removed explicitly.
    off_diagonal = rows != cols
    p1s = arr_prots[rows[off_diagonal]]
    p2s = arr_prots[cols[off_diagonal]]
    return ut.zip_exact(p1s, p2s)
if __name__ == '__main__':
    # Command-line entry: compute one score matrix for an elution file and
    # save it tab-delimited next to the input as <filename>.corr_<method>.
    # The optional third argument is the number of resampling repeats for the
    # poisson-based methods.
    supported = ('poisson', 'cosine_poisson', 'euclidean_poisson',
                 'euclidean', 'apex')
    nargs = len(sys.argv)
    if nargs < 3:
        sys.exit("usage: python score.py filename method(%s) [argument]"
                 % '|'.join(supported))
    fname = sys.argv[1]
    method = sys.argv[2]
    methodarg = None if nargs < 4 else int(sys.argv[3])
    # Reject unknown methods before the (slow) load; they used to fall
    # through every branch and die with a NameError when saving.
    # dotproduct, corrcoef and cov are disabled and no longer advertised.
    if method not in supported:
        sys.exit("unknown method %r; expected one of: %s"
                 % (method, ', '.join(supported)))
    elut = el.load_elution(fname)
    # Only override the callee's default repeat count when one was given.
    repeat_kwargs = {'repeat': methodarg} if methodarg else {}
    if method == 'poisson':
        corr = traver_corr(elut.mat, **repeat_kwargs)
    elif method in ('cosine_poisson', 'euclidean_poisson'):
        # The metric is the part before '_poisson' whether or not a repeat
        # count was supplied.
        corr = poisson_repeat(elut.mat, metric=method.split('_')[0],
                              **repeat_kwargs)
    elif method == 'euclidean':
        corr = pdist_score(elut.mat, norm_rows=True, norm_cols=True,
                           metric=method)
    else:  # 'apex'
        corr = apex_scores_toarray_fast(ApexScores(elut))
    fileout = fname + '.corr_' + method
    np.savetxt(fileout, corr, delimiter='\t')