/
diff-exp.py
executable file
·407 lines (330 loc) · 10 KB
/
diff-exp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
#!/usr/bin/env python
#
# diff-exp.py
#
# shawn driscoll
# 20120928
#
# differential expression on count data across two conditions with
# or without biological replicates.
#
import sys,argparse
sys.path.append("/Users/pfafflab/coding/python")
import numpy as np
import bootstrap as bs
import de_utils
from math import log,sqrt,floor,ceil
#
# parse arguments
#
# Command-line interface: two positional, comma-separated file lists (one per
# condition) plus optional labels, keep factor and a significant-only switch.
parser = argparse.ArgumentParser(
    description="Differential expression testing on count data across two conditions with or without replicates. Source files should have been produced by nexpr.",
)
parser.add_argument(
    "condition_1_files",
    type=str,
    help="Comma separated list of files for condition 1",
)
parser.add_argument(
    "condition_2_files",
    type=str,
    help="Comma separated list of files for condition 2",
)
parser.add_argument(
    "-L",
    dest="labels",
    type=str,
    default="cond_1,cond_2",
    help="Conditions separated by comma (default: cond_1,cond_2)",
)
parser.add_argument(
    "-k",
    dest="k_factor",
    type=float,
    default=0.5,
    help="Keep factor. The ratio of replicates from either condition that must have the minimum required hits. (default: 0.5)",
)
parser.add_argument(
    "--sig-only",
    dest="sig_only",
    action="store_true",
    help="Return only significant genes (default: False)",
)
# The command line is parsed at module level; main() receives the result.
args = parser.parse_args()
#
# globals
#
# Smallest count a sample must reach to count as a "hit" in condition_mask().
MIN_COUNT = 10
# Number of count-level bins used for the per-bin significance limits in main().
NUM_BINS = 40
#
# main
#
def _read_count_table(path):
    """Read one tab-separated count table.

    Each non-blank line must have at least four tab-separated fields. The
    first three are kept verbatim as the row's identifying info (they are
    later printed under the gene_id / transcript_id / location headers) and
    the fourth is parsed as the count. Any further columns are ignored.

    Returns a tuple ``(info_rows, counts)`` of equal-length lists.
    Raises IOError if the file cannot be opened or read, and
    ValueError / IndexError if a line is malformed.
    """
    info_rows = []
    counts = []
    with open(path, "r") as handle:
        for line in handle:
            fields = line.strip().split("\t")
            if fields == [""]:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            info_rows.append(fields[0:3])
            counts.append(float(fields[3]))
    return info_rows, counts


def main(args):
    """Run the two-condition differential expression test and print a table.

    Steps:
      1. load one count column per input file (rows are matched by position),
      2. drop rows that fail condition_mask(),
      3. scale every sample by its upper-quartile count (relative to the
         mean upper quartile across samples),
      4. split the log10 count range into NUM_BINS bins and, per bin, derive
         a limit on the absolute difference and absolute log2 fold change
         from all pairwise sample comparisons (bootstrapped percentile),
      5. score each row 0/1/2 by how many of its two bin limits the observed
         between-condition change exceeds,
      6. write a tab-separated table to stdout.

    ``args`` is the argparse namespace (condition_1_files, condition_2_files,
    labels, k_factor, sig_only).

    Returns 0 on success and 1 when the inputs cannot be analysed; progress
    and error messages go to stderr.
    """
    #
    # variables and initializations
    #
    # Drop empty names: "".split(",") yields [""], which would otherwise make
    # the "missing input files" check below impossible to trigger.
    cond_1_files = [name for name in args.condition_1_files.split(",") if name]
    cond_2_files = [name for name in args.condition_2_files.split(",") if name]
    all_files = cond_1_files + cond_2_files
    num_cond_1 = len(cond_1_files)
    num_cond_2 = len(cond_2_files)
    num_samples = num_cond_1 + num_cond_2
    condition_labels = args.labels.split(",")
    #
    # check file counts
    #
    if num_cond_1 == 0 or num_cond_2 == 0:
        sys.stderr.write("Error: missing input files!\n")
        return 1
    #
    # check condition label counts
    #
    if len(condition_labels) != 2:
        sys.stderr.write("Error: insufficient label count for conditions\n")
        return 1
    #
    # load files
    #
    sys.stderr.write("> loading files...\n")
    gene_info = []
    sample_columns = []
    for idx, path in enumerate(all_files):
        try:
            info_rows, counts = _read_count_table(path)
        except IOError:
            sys.stderr.write("failed to open input file {:s}\n".format(path))
            return 1
        if idx == 0:
            # the first file defines the rows; later files are matched by position
            gene_info = info_rows
        elif len(counts) != len(gene_info):
            sys.stderr.write(
                "Error: input file {:s} has {:d} rows but {:s} has {:d}\n".format(
                    path, len(counts), all_files[0], len(gene_info)))
            return 1
        sample_columns.append(counts)
    # one row per gene, one column per sample; built in a single allocation
    gene_counts = np.array(sample_columns, dtype=float).T
    #
    # files are loaded now we want to filter out low count genes by generating a mask
    #
    sys.stderr.write("> masking low count tags...\n")
    row_mask = condition_mask(gene_counts, num_cond_1, num_cond_2, args.k_factor)
    keep = np.asarray(row_mask) != 0
    num_masked = int(keep.sum())
    if num_masked == 0:
        sys.stderr.write("Error: no genes passed the minimum count filter\n")
        return 1
    # boolean indexing always yields a 2-D copy, even when a single row is kept
    gene_counts_masked = gene_counts[keep]
    gene_info_masked = [info for info, kept in zip(gene_info, keep) if kept]
    #
    # calculate normalization factors (upper quartile of each sample, scaled
    # so the factors average to 1)
    #
    norm_factors = np.asarray(
        np.percentile(gene_counts_masked, 75, axis=0), dtype=float)
    if np.any(norm_factors <= 0):
        sys.stderr.write(
            "Error: a sample has an upper-quartile count of zero; cannot normalize\n")
        return 1
    norm_factors = norm_factors / norm_factors.mean()
    #
    # normalize masked counts
    #
    gene_counts_masked = gene_counts_masked / norm_factors
    #
    # establish confidence interval bins
    #
    sys.stderr.write("> establishing confidence intervals...\n")
    # Bins evenly divide the log10(count+1) range between the 1st and 99th
    # percentile of all normalized counts; values outside that range are
    # clamped into the first / last bin by hash_bin_index().
    low_count, high_count = np.percentile(gene_counts_masked, [1, 99])
    bin_floor = transform_count(low_count)
    bin_interval = (transform_count(high_count) - bin_floor) / float(NUM_BINS)

    def bin_of(count):
        # With a zero-width range every value shares bin 0 (avoids dividing
        # by a zero bin interval).
        if bin_interval <= 0:
            return 0
        return int(hash_bin_index(
            transform_count(count), bin_interval, bin_floor, NUM_BINS - 1))

    diff_list = [[] for _ in range(NUM_BINS)]
    fold_list = [[] for _ in range(NUM_BINS)]
    # generate distributions of absolute differences and absolute log2 fold changes
    # for all pairwise comparisons (every pair of samples, regardless of condition)
    for row in gene_counts_masked:
        row_bins = [bin_of(value) for value in row]
        for j in range(num_samples - 1):
            x1 = row[j]
            b1 = row_bins[j]
            for k in range(j + 1, num_samples):
                x2 = row[k]
                b2 = row_bins[k]
                if not (x1 > 0 or x2 > 0):
                    continue
                # NOTE(review): the original indentation was lost; the fold
                # change section is assumed to sit under the same
                # "x1 > 0 or x2 > 0" guard as the difference - confirm.
                # difference: recorded in the bin of each member of the pair
                diff = abs(x1 - x2)
                diff_list[b1].append(diff)
                if b1 != b2:
                    diff_list[b2].append(diff)
                # fold change: only finite values contribute
                fold = de_utils.log2_ratio(x1, x2)
                if np.isfinite(fold):
                    fold = abs(fold)
                    fold_list[b1].append(fold)
                    if b1 != b2:
                        fold_list[b2].append(fold)
    # lists are complete. take the bootstrapped 95th percentile per bin, or
    # the 97.5th when there are only two samples in total
    limit_stat = quar_stat_low if num_samples > 2 else quar_stat_high
    diff_lim = np.zeros(NUM_BINS)
    fold_lim = np.zeros(NUM_BINS)
    for i in range(NUM_BINS):
        # bins with no observations keep a limit of 0
        if len(diff_list[i]) > 0:
            diff_lim[i] = bs.bootstrap(diff_list[i], limit_stat).mean
        if len(fold_list[i]) > 0:
            fold_lim[i] = bs.bootstrap(fold_list[i], limit_stat).mean
    #
    # do differential expression
    #
    sys.stderr.write("> running differential expression test...\n")
    cond_fold = np.zeros(num_masked)
    cond_sig = np.zeros(num_masked, dtype=int)
    cond_a_mean = np.zeros(num_masked)
    cond_b_mean = np.zeros(num_masked)
    for i in range(num_masked):
        # get mean count values
        cond_a_mean[i] = gene_counts_masked[i, 0:num_cond_1].mean()
        cond_b_mean[i] = gene_counts_masked[i, num_cond_1:].mean()
        b1 = bin_of(cond_a_mean[i])
        b2 = bin_of(cond_b_mean[i])
        # do fold change
        cond_fold[i] = de_utils.log2_ratio(cond_a_mean[i], cond_b_mean[i])
        if np.isfinite(cond_fold[i]):
            # fold change is finite: compare it against the fold limits
            change = abs(cond_fold[i])
            bounds = (fold_lim[b1], fold_lim[b2])
        else:
            # fold change is not finite: fall back to the absolute difference
            change = abs(cond_b_mean[i] - cond_a_mean[i])
            bounds = (diff_lim[b1], diff_lim[b2])
        # score = number of the two bin limits exceeded (0, 1 or 2)
        cond_sig[i] = sum(1 for bound in bounds if change > bound)
    #
    # print out results!
    #
    sys.stdout.write("gene_id\ttranscript_id\tlocation\t{:s}\t{:s}\tlog2FoldChange\tsig_score\tsig\n".format(condition_labels[0],condition_labels[1]))
    for i in range(num_masked):
        if args.sig_only and not cond_sig[i] > 0:
            continue
        lout = [
            "\t".join(gene_info_masked[i]),
            "{:0.4f}".format(cond_a_mean[i]),
            "{:0.4f}".format(cond_b_mean[i]),
            "{:0.4f}".format(cond_fold[i]),
            "{:d}".format(int(cond_sig[i])),
            "yes" if cond_sig[i] > 0 else "no",
        ]
        sys.stdout.write("\t".join(lout) + "\n")
    return 0
#
# condition_mask
# builds a per-row keep/drop mask for the count matrix. a row is kept when,
# in at least one of the two conditions, enough samples reach the minimum
# count to be considered for analysis.
#
def condition_mask(data, num_cond_1, num_cond_2, k_factor, min_count=None):
    """Return a float array with 1 for rows to keep and 0 for rows to drop.

    data       -- count matrix, one row per gene and one column per sample;
                  the first num_cond_1 columns are condition 1 and the next
                  num_cond_2 columns are condition 2. A 1-D sequence is
                  treated as a single row.
    num_cond_1 -- number of condition 1 samples (columns)
    num_cond_2 -- number of condition 2 samples (columns)
    k_factor   -- fraction of a condition's samples that must reach the
                  minimum count for the row to pass on that condition
    min_count  -- minimum count for a sample to qualify; defaults to the
                  module-level MIN_COUNT

    A row passes when either condition has at least
    ``num_samples_in_condition * k_factor`` qualifying samples.
    """
    if min_count is None:
        min_count = MIN_COUNT
    counts = np.asarray(data, dtype=float)
    if counts.size == 0:
        # nothing to test: no rows to keep
        return np.zeros(0)
    counts = np.atleast_2d(counts)
    # True wherever a sample reaches the minimum count
    hits = counts >= min_count
    cond_1_hits = hits[:, :num_cond_1].sum(axis=1)
    cond_2_hits = hits[:, num_cond_1:num_cond_1 + num_cond_2].sum(axis=1)
    passed = ((cond_1_hits >= num_cond_1 * k_factor) |
              (cond_2_hits >= num_cond_2 * k_factor))
    return passed.astype(float)
def transform_count(x):
    """Return log10(x + 1), the scale on which counts are binned.

    The +1 keeps a count of zero defined (it maps to 0.0). Raises ValueError
    for x <= -1, where the logarithm is undefined.
    """
    shifted = x + 1
    return log(shifted) / log(10)
def hash_bin_index(x, iv, p25, max_bin):
    """Return the index of the equal-width bin that contains x.

    x       -- value to place
    iv      -- bin width
    p25     -- lower edge of bin 0 (the value at which binning starts)
    max_bin -- highest valid bin index

    The result is always an int clamped to [0, max_bin]: values below the
    first bin go to 0 and values beyond the last bin go to max_bin. When the
    bin width is not positive every value is placed in bin 0, since no
    meaningful offset can be computed.
    """
    if iv <= 0:
        return 0
    offset = (x - p25) / iv
    # Clamp before flooring so infinite offsets never reach floor().
    if offset >= max_bin:
        return max_bin
    if offset < 0:
        return 0
    return int(floor(offset))
def hash_bin_index_list(x, iv, p25, max_bin):
    """Return the equal-width bin index of every value in x as an int array.

    x       -- sequence of values to place
    iv      -- bin width
    p25     -- lower edge of bin 0 (the value at which binning starts)
    max_bin -- highest valid bin index

    Indices are clamped to [0, max_bin]. When the bin width is not positive
    every value is placed in bin 0.
    """
    values = np.asarray(x, dtype=float)
    if iv <= 0:
        return np.zeros(values.shape, dtype=int)
    raw_index = np.floor((values - p25) / iv)
    # clip while still floating point so out-of-range values clamp cleanly
    return np.clip(raw_index, 0, max_bin).astype(int)
def diff_stat(x1, x2):
    """Return the mean of x1 minus the mean of x2."""
    return np.mean(x1) - np.mean(x2)
def quar_stat_low(x1):
    """Return the 95th percentile of x1.

    Despite the name this is not a quartile; main() uses it as the statistic
    passed to the bootstrap when there are more than two samples.
    """
    return np.percentile(np.asarray(x1), 95)
def quar_stat_high(x1):
    """Return the 97.5th percentile of x1.

    Despite the name this is not a quartile; main() uses it as the statistic
    passed to the bootstrap when there are only two samples in total.
    """
    return np.percentile(np.asarray(x1), 97.5)
#
# entry point
#
if __name__ == "__main__":
    # main() returns a non-zero status on failure; sys.exit() treats a None
    # return as exit status 0.
    sys.exit(main(args))