/
orth.py
153 lines (137 loc) · 4.92 KB
/
orth.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
from __future__ import division
import itertools as it
import os
import utils as ut
# Species-pair keys ('Sp1-Sp2') that orth_fname accepts. A pair is looked up
# here in either order, and the matching key is appended to 'table.' to form
# the orthology table's file name.
keys = "Hs-Ce Hs-Dd Hs-Dm Hs-Mm Hs-Nv Hs-Sp Hs-Sc Hs_uni-Ce_uni Ce-Dm Ce-Mm Ce-Nv Ce-Sp Sp-Dm Sp-Mm Sp-Nv Mm-Dm Dm-Nv Hs-Xl Hs-X2 Hs-X3 Hs-Pf Hs-Tg".split()
def odict(from_sp, to_sp):
    """
    Build the ortholog lookup from from_sp proteins to sets of to_sp
    proteins, eg:
    {HsProt1: set([CeProtA, CeProtB,...]), ...}

    Returns None when both species are the same.
    """
    if from_sp == to_sp:
        return None
    # The result is not needed here; the call is kept so an unknown species
    # pair is still rejected before anything is loaded.
    orth_fname(from_sp, to_sp)
    # load_ogroups already orients every group as (from_sp, to_sp), so no
    # further swapping is requested.
    return _ogroups_to_odict(load_ogroups(from_sp, to_sp), swap_order=False)
def orth_fname(from_sp, to_sp):
    """
    Locate the orthology table file for a pair of species.

    The pair is looked up in the module-level keys list, first as
    'from_sp-to_sp' and then as 'to_sp-from_sp'.

    Returns a tuple (fname, swap_order):
      fname: path from ut.proj_path('convert_orth', 'table.<key>')
      swap_order: True when only the reversed key 'to_sp-from_sp' was found,
        False when 'from_sp-to_sp' was found.

    Raises AssertionError if neither ordering is present in keys.
    """
    key = from_sp + '-' + to_sp
    swap_order = False
    if key not in keys:
        key = to_sp + '-' + from_sp
        swap_order = True
        if key not in keys:
            # Raised explicitly instead of through an `assert` statement so
            # the check still runs under `python -O`; the exception type and
            # message are the same as before.
            raise AssertionError("Orthogroup key %s not in keys list" % key)
    fname = ut.proj_path('convert_orth', 'table.' + key)
    return fname, swap_order
def odict_1to1(from_sp, to_sp):
    """
    Flat dict {from_sp protein: to_sp protein} restricted to 1to1 orthologs:
    the from_sp protein has exactly one to_sp ortholog, and that ortholog has
    exactly one ortholog in the reverse odict.
    """
    forward = odict(from_sp, to_sp)
    backward = odict(to_sp, from_sp)
    one_to_one = {}
    for prot, orths in forward.items():
        if len(orths) != 1:
            continue
        partner = list(orths)[0]
        if len(backward[partner]) == 1:
            one_to_one[prot] = partner
    return one_to_one
def convert_dict_single(fromtype, totype):
    """
    Look for a direct, one-step conversion dict between two id types.

    totype: must be Sp (eg 'Hs') or Sp_seqdb
    Returns None if not necessary or not found.
    """
    pieces = totype.split('_')
    if len(pieces) > 1:
        tosp, toseqdb = pieces
        # A seqdb suffix equal to the species' configured default is
        # redundant, so reduce totype to the bare species code.
        if toseqdb == ut.config()[tosp + '_default']:
            totype = tosp
    if fromtype == totype:
        return None
    # Two bare two-letter species codes: use the orthology tables.
    if len(fromtype) == 2 and len(totype) == 2:
        return odict(fromtype, totype)
    return custom_conversion(fromtype, totype)
def convert_dict(fromtype, totype):
    """
    Try a single conversion step first; if that yields nothing, go through
    the target species (the first two characters of totype) and compose the
    two steps.
    Returns None if not necessary or not found.
    """
    direct = convert_dict_single(fromtype, totype)
    if direct:
        return direct
    # Two-step route: fromtype -> target species -> totype.
    via_sp = totype[:2]
    to_species = convert_dict_single(fromtype, via_sp)
    within_species = convert_dict_single(via_sp, totype)
    if to_species and within_species:
        return ut.compose_dict_sets(to_species, within_species)
    return None
def all_odicts(sp, sps):
    """
    Collect the odict from sp to every other species named in sps.

    Returns {other_sp: odict(sp, other_sp)}; sp itself is skipped.
    """
    return dict((other, odict(sp, other)) for other in sps if sp != other)
def all_ogroup_sizes(fromsp, tosps):
    """
    For each species in tosps other than fromsp, compute the orthogroup size
    dict of the fromsp -> species odict.

    Returns {other_sp: ogroup_size_dict(odict(fromsp, other_sp))}.
    """
    per_species = all_odicts(fromsp, tosps)
    return dict((othersp, ogroup_size_dict(od))
                for othersp, od in per_species.items())
def ogroup_size_dict(odict):
    """
    Given a normal odict of fromid: set(toids), return a dict of fromid: size
    of that side of the orthogroup.

    The size is the length of the inverted odict's entry for one of the
    fromid's toids.
    """
    # presumably maps each toid back to its set of fromids -- confirm in utils
    inverse = ut.dict_inverse_sets(odict)
    sizes = {}
    for fromid, toids in odict.items():
        # Orthogroups are cohesive, so any single toid stands for the group;
        # take the first one.
        representative = list(toids)[0]
        sizes[fromid] = len(inverse[representative])
    return sizes
def custom_conversion(fromtype, totype):
    """
    Load a custom conversion file named '<fromtype>2<totype>.tab' from the
    project's 'convert' location, if one exists.
    Return None if not found.
    """
    fpath = ut.proj_path('convert', "%s2%s.tab" % (fromtype, totype))
    if not os.path.exists(fpath):
        return None
    return ut.load_dict_sets(fpath)
def _ogroups_to_odict(ogroups, swap_order=False):
"""
From a list of orthogroups, return a dict from sp1 prots to a set of sp2
prots. We want a dictionary from the first species in the file to the second,
unless swap_order is True.
"""
sp1col = 1 if swap_order else 0
sp2col = 0 if swap_order else 1
orthdict = dict([(p1,set([p2 for p2 in og[sp2col]])) for og in ogroups for
p1 in og[sp1col]])
return orthdict
def load_ogroups(from_sp, to_sp, fname=None):
    """
    Read an inparanoid table.Sp1-Sp2 file into a list of orthogroups. Each
    orthogroup is a tuple of 1) the list of from_sp proteins and 2) the list
    of to_sp proteins.
    Eg: [([HsProtA, HsProtB,..],[CeProtA,CeProtB,..]), ([..],[..]), ...]

    When fname is None the file is located with orth_fname, and the two
    protein columns are swapped if the file is keyed as to_sp-from_sp. An
    explicit fname is read without swapping.
    """
    if fname is None:
        fname, swap_order = orth_fname(from_sp, to_sp)
    else:
        swap_order = False
    # Row indices 2 and 3 hold the two species' proteins in the order of the
    # filename.
    if swap_order:
        from_ind, to_ind = 3, 2
    else:
        from_ind, to_ind = 2, 3
    parsed = []
    for row in ut.load_tab_file(fname):
        # Protein ids alternate with meaningless conf scores, so keep every
        # other whitespace-separated token.
        from_prots = row[from_ind].split()[::2]
        to_prots = row[to_ind].split()[::2]
        parsed.append((from_prots, to_prots))
    # The header row was parsed like any other; drop its entry.
    return parsed[1:]
def orth_pairs(p, od):
    """
    Map a ppi pair onto all combinations of its members' orthologs.

    p: a ppi pair of ids
    od: an orth.odict; None means same species, so p is handed back as the
        only pair

    Returns [p] when od is None, an empty list when either id is missing
    from od, and otherwise an itertools.product over the two ortholog
    collections.
    """
    if od is None:
        return [p]
    if p[0] not in od or p[1] not in od:
        return []
    return it.product(od[p[0]], od[p[1]])