/
Pairwise.py
275 lines (232 loc) · 11 KB
/
Pairwise.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
'''
Get python modules
'''
import cPickle
from itertools import chain
'''
Get third-party modules
'''
from fisher import pvalue as fisher
'''
Get MOCA modules
'''
from Statistics import contingency_table, p_adjust, interaction, Performance, correlation, correlation_pvalue, EffectSize
from DataHandler import load_data, get_supervised_dataset, get_path
def make_report(Labels, Features, Arguments, Supervised=False):
'''
This report is retrieved for all MOCA results files when 'Reports = Results' is set in the
Arguments file. Tells you from which input data the results were generated, which file the
results were written to, and what parameters were used during the calculation.
'''
Report = {}
#Use the 'Entries' key to set the order of output when 'Reports = Results' is run.
Report["Entries"] = ["Pairwise", "Input data", "Label count", "Supervised", "Correction method",
"FDR", "Number of significant interactions", "Continuous-valued correlation"]
Report["Input data"] = [Data for Data in Arguments.Data]
Report["Label count"] = len(Labels)
Report["Correction method"] = Arguments.CorrectionMethod
Report["FDR"] = Arguments.FDR
Report["Pairwise"] = Arguments.Pairwise
Report["Supervised"] = Supervised #just initialized it
if Supervised: Report["Supervised"] = [True, "(" + Supervised + ")"] #then overwrite the key if True
Report["Number of significant interactions"] = len(Features)
if Arguments.UntransformedData:
Report["Continuous-valued correlation"] = True
else:
Report["Continuous-valued correlation"] = False
return Report
def unsupervised(Arguments):
'''
Pairwise MOCA calculations that are executed if the Phenotype argument is False (the default). Similar to
so-called 'supervised' pairwise mode, except that no performance metrics are calculated (sens, spec, PPV, NPV, etc.).
In unspervised mode, you can compare all inter-datatype pairs for two datatypes, or all intra-datatype pairs for
a single datatype.
'''
if len(Arguments.Data) > 2:
print "Unsupervised pairwise calculations can consider no more that two datatypes at a time."
print "If you provide only one datatype, all intra-datatype pairs will be considered. If you"
print "provide two datatypes, all inter-datatype comparisons will be made. Please change the"
print "'Data = ' field. Exiting..."
exit()
Data = load_data(Arguments)
Features = list(chain(*Data.Transformed.Features.values()))
Variates = list(chain(*Data.Transformed.Variates.values()))
if len(Arguments.Data) == 1:
Features1 = Features
Features2 = Features
if len(Arguments.Data) == 2:
Features1 = Data.Transformed.Features[Arguments.Data[0]]
Features2 = Data.Transformed.Features[Arguments.Data[1]]
PValues = {}
Interactions = {}
SampleCounts = {}
CaseCounts = {} #just the positive class here
Performances = {}
EffectSizes = {}
Tested = []
for Feature1 in Features1:
Tested.append(Feature1)
for Feature2 in Features2:
if Feature2 not in Tested:
a,b,c,d = contingency_table(Variates[Features.index(Feature1)], Variates[Features.index(Feature2)],
NA=Arguments.NA)
PValue = fisher(a,b,c,d)
PValues[tuple([Feature1, Feature2])] = PValue.two_tail
Interactions[tuple([Feature1, Feature2])] = interaction(PValue)
SampleCounts[tuple([Feature1, Feature2])] = a + b + c + d
CaseCounts[tuple([Feature1, Feature2])] = a + c
#A placeholder solely to make pairwise post-processing generalizable
Performances[tuple([Feature1, Feature2])] = "NA"
EffectSizes[tuple([Feature1, Feature2])] = "NA"
FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
for Pair, PValue in PValues.items():
if FDRs[PValue] < Arguments.FDR:
pass
else:
PValues.pop(Pair, None)
Interactions.pop(Pair, None)
SampleCounts.pop(Pair, None)
CaseCounts.pop(Pair, None)
Performances.pop(Pair, None)
EffectSizes.pop(Pair, None)
Results = {}
Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments)
Results["PValues"] = PValues
Results["Interactions"] = Interactions
Results["FDRs"] = FDRs
Results["SampleCounts"] = SampleCounts
Results["CaseCounts"] = CaseCounts
Results["Performances"] = Performances
Results["EffectSizes"] = EffectSizes
if Arguments.Filename.lower() == "default":
Pickle = "_".join(["Pairwise", "_".join(sorted(Arguments.Data)), str(Arguments.FeatureMin),
Arguments.CorrectionMethod])
else:
Pickle = Arguments.Filename
cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)
return
def supervised(Arguments):
'''
MOCA pairwise calculations executed if a 'Phenotype' is provided in the Arguments file. Not technically
supervised 'learning', as there is no optimization (every possible pairwise comparison is tested).
Output includes perfomance metrics such as sensitivity, specificity, PPV, and NPV, for each features
ability to predict the phenotype.
'''
Labels, Features, Variates, Phenotypes, Markers = get_supervised_dataset(Arguments)
#Clustermode support if called
Node = int(Arguments.MultiProcessMode[0])
TotalNodes = int(Arguments.MultiProcessMode[1])
for Phenotype in Phenotypes[Node::TotalNodes]:
PValues = {}
Interactions = {}
Performances = {}
SampleCounts = {}
CaseCounts = {} #just the postive class here
EffectSizes = {}
for Marker in Markers:
TP,FP,FN,TN = contingency_table(Variates[Features.index(Marker)], Variates[Features.index(Phenotype)],
NA=Arguments.NA)
PValue = fisher(TP,FP,FN,TN)
PValues[Marker] = PValue.two_tail
Interaction = interaction(PValue)
Interactions[Marker] = Interaction
Performances[Marker] = Performance(Interaction, TP,FP,FN,TN)
EffectSizes[Marker] = EffectSize(Interaction, TP,FP,FN,TN)
SampleCounts[Marker] = TP + FP + FN + TN
CaseCounts[Marker] = TP + FN
FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
for Marker in Markers:
FDR = FDRs[PValues[Marker]]
if FDR < Arguments.FDR:
pass
else:
PValues.pop(Marker, None)
Interactions.pop(Marker, None)
Performances.pop(Marker, None)
SampleCounts.pop(Marker, None)
CaseCounts.pop(Marker, None)
EffectSizes.pop(Marker, None)
if len(PValues.keys()):
Results = {}
Results["Report"] = make_report(Labels, PValues.keys(), Arguments, Supervised=Phenotype[:Phenotype.index(":")])
Results["PValues"] = PValues
Results["Interactions"] = Interactions
Results["Performances"] = Performances
Results["FDRs"] = FDRs
Results["SampleCounts"] = SampleCounts
Results["CaseCounts"] = CaseCounts
Results["EffectSizes"] = EffectSizes
if Arguments.Filename.lower() == "default":
DataTypes = set(Arguments.Data).difference(set([Arguments.Phenotype]))
Pickle = "_".join(["Pairwise", "Phenotype=" + Phenotype[:Phenotype.index(":")],
"_".join(sorted(DataTypes)), str(Arguments.FeatureMin), Arguments.CorrectionMethod])
else:
Pickle = Arguments.Filename + "_" + Phenotype[:Phenotype.index(":")]
cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)
return
def pairwise_continuous(Arguments):
'''
'''
if len(Arguments.Data) > 2:
print "Unsupervised pairwise calculations can consider no more that two datatypes at a time."
print "If you provide only one datatype, all intra-datatype pairs will be considered. If you"
print "provide two datatypes, all inter-datatype comparisons will be made. Please change the"
print "'Data = ' field. Exiting..."
exit()
Data = load_data(Arguments)
Features = list(chain(*Data.Features.values()))
Variates = list(chain(*Data.Variates.values()))
if Arguments.Phenotype:
Features1 = [Feature for Feature in Features if Arguments.Phenotype in Feature]
Features2 = [Feature for Feature in Features if Arguments.Phenotype not in Feature]
else:
if len(Arguments.Data) == 1:
Features1 = Features
Features2 = Features
if len(Arguments.Data) == 2:
Features1 = Data.Features[Arguments.Data[0]]
Features2 = Data.Features[Arguments.Data[1]]
PValues = {}
Correlations = {}
Tested = []
for Feature1 in Features1:
Tested.append(Feature1)
for Feature2 in Features2:
if Feature2 not in Tested:
PValues[tuple([Feature1, Feature2])] = correlation_pvalue(Variates[Features.index(Feature1)],
Variates[Features.index(Feature2)])
Correlations[tuple([Feature1, Feature2])] = correlation(Variates[Features.index(Feature1)],
Variates[Features.index(Feature2)])
FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
for Pair, PValue in PValues.items():
if FDRs[PValue] < Arguments.FDR:
pass
else:
PValues.pop(Pair, None)
Correlations.pop(Pair, None)
if len(PValues.keys()):
Results = {}
Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments, Supervised=Arguments.Phenotype)
Results["PValues"] = PValues
Results["Correlations"] = Correlations
Results["FDRs"] = FDRs
if Arguments.Filename.lower() == "default":
Pickle = "_".join(["_".join(sorted(Arguments.Data)), Arguments.CorrectionMethod])
else:
Pickle = Arguments.Filename
cPickle.dump(Results, open(get_path("MOCA.results") + "/" + Pickle, "wb"), -1)
return
def pairwise(Arguments):
'''
If you supply a phenotype, then pairwise MOCA is simply selecting features that
are significantly co-occurring or mutually exclusive with the phenotype. If no
phenotype is specified, MOCA will compare every possible pairwise interaction in the
provided data.
'''
if Arguments.UntransformedData:
pairwise_continuous(Arguments)
elif Arguments.Phenotype:
supervised(Arguments)
else:
unsupervised(Arguments)
return