-
Notifications
You must be signed in to change notification settings - Fork 0
/
subdivide.py
113 lines (100 loc) · 3.97 KB
/
subdivide.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from __future__ import division
import itertools as it
import numpy as np
import utils as ut
import evidence as ev
import ml
import pairdict as pd
import elution as el
empty_val = 'Empty'
class SubDivider(object):
def __init__(self, arr, cluster, elutfs, pd_feats=None, elmax=None,
avg_ev=0.2):
self.pd_clust_feats, self.feat_names = (pd_evidences(cluster, arr) if pd_feats is None
else pd_feats)
self.elut_max = (elut_gene_maxes(elutfs, cluster) if elmax is None else
elmax)
self.cluster = cluster
self.avg_ev = avg_ev
def go_split(self):
# all combs of 1/0 assignments for each member of the cluster
n = len(self.cluster)
all_ins_outs = [np.binary_repr(x,width=n) for x in range(2**n)]
raw_scores = [self.score_split(zip(self.cluster, ins_outs)) for
ins_outs in all_ins_outs]
print raw_scores
scores = to_z_avg_wempties(raw_scores)
scored_splits = zip(scores, all_ins_outs)
return scored_splits
def score_split(self, members):
"""
members: list of [(id1, incomplex), ...]
"""
return [self.score_pair(i,j) for i,j in it.combinations(members,2)]
def score_pair(self, i_i_in, j_j_in):
i,i_in = i_i_in; j,j_in = j_j_in
i_in, j_in = int(i_in), int(j_in)
if i_in and j_in:
return self.score_together(i,j)
elif i_in or j_in:
return self.score_apart(i,j)
else:
return empty_val
def score_together(self, i, j):
"""
Looking only for evidence they're together in ANY contexts, regardles of
wehther they're also not in others >> clip to include only positive values.
"""
pair = self.pd_clust_feats.find((i,j))
return sum(np.clip(self.pd_clust_feats.d[pair],0,1000)) if pair else 0
def score_apart(self, gi, gj, cutoff=2):
pair = self.pd_clust_feats.find((gi,gj))
if pair:
evs = self.pd_clust_feats.d[pair]
else:
evs = [0]*len(self.feat_names)
cumsum = 0
for elutf in self.elut_max:
maxi,maxj = [self.elut_max[elutf].get(x,0) for x in gi,gj]
if maxi >= cutoff and maxj >= cutoff:
use_inds = [ind for ind,name in enumerate(self.feat_names) if
frac_name(name) == ut.shortname(elutf)]
cumsum += sum([self.avg_ev-evs[ind] for ind in use_inds])
#elif maxi > cutoff or maxj > cutoff:
#pass
return cumsum
#else:
#return 0
def frac_name(score_name):
return '_'.join(score_name.split('_')[:-1])
def to_z_avg_wempties(lol):
def flat(lol):
return [x for l in lol for x in l if x!=empty_val]
mu,sd = np.mean(flat(lol)), np.std(flat(lol))
def zscore_wempties(x, mu, sd):
return 0 if x==empty_val else (x-mu)/sd
newlol = [[zscore_wempties(x, mu, sd) for x in l] for l in lol]
return [np.mean(l) for l in newlol]
def elut_gene_maxes(elutfs, geneids):
d = {}
for f in elutfs:
e = el.load_elution(f)
prots_inv = ut.list_inv_to_dict(e.prots)
for gid in geneids:
if gid in prots_inv:
d.setdefault(f,{})[gid] = np.max(e.mat[prots_inv[gid]])
return d
def pd_evidences(cluster,arr):
arr_clust = arr[[i for i,r in enumerate(arr)
if r[0] in cluster and r[1] in cluster]]
# doesn't seem right--if most the interactions are strong, shouldn't
# normalize them down--should still count. but does it matter since it's
# all comparative? messes with thinking about the clipping though in
# score_together.
#features, _ = ml.normalize(ml.arr_feats(arr_clust))
sps = ut.config()['elut_species'].split('_')
names = [n for n in arr.dtype.names[3:] if n[:2] in sps]
features = arr[names]
pd_ev = pd.PairDict([[r[0],r[1]] + list(features[i]) for i,r in
enumerate(arr_clust)])
return pd_ev, names