# geometric_model.py
# Forked from jeffhussmann/ribosomes.
import numpy as np
import matplotlib.pyplot as plt
import Serialize
import positions
import brewer2mpl
from scipy.optimize import leastsq
from itertools import cycle
import h5py
import select_work
from Sequencing import utilities
def smoothed(array, window_size):
    """Return a running mean of array with half-width window_size.

    Interior positions average the centered window of 2 * window_size + 1
    values; positions within window_size of either edge average the
    truncated prefix (array[:i + 1]) or suffix (array[i:]) instead, so the
    result has the same length and dtype as the input.
    """
    n = len(array)
    result = np.zeros_like(array)
    # Left edge: average everything up to and including position i.
    for i in range(window_size):
        result[i] = array[:i + 1].sum() / float(i + 1)
    # Interior: full centered window.
    full_window = float(2 * window_size + 1)
    for i in range(window_size, n - window_size):
        result[i] = array[i - window_size:i + window_size + 1].sum() / full_window
    # Right edge: average everything from position i to the end.
    for i in range(n - window_size, n):
        result[i] = array[i:].sum() / float(n - i)
    return result
# Generator yielding (gene_name, counts-array) pairs from an hdf5 read
# positions file.
def counts_from_read_positions_fn(read_positions_fn, key='all'):
    """Yield (gene_name, counts) for every gene in read_positions_fn.

    key='nonzero' yields the 'all' counts minus the length-0 counts;
    any other key yields that key's counts directly.
    """
    hdf5_file = h5py.File(read_positions_fn, 'r')
    progress = utilities.progress_bar(len(hdf5_file), hdf5_file)
    # (Some specific genes, e.g. YLR256W, were previously skipped here.)
    for gene_name in progress:
        group = hdf5_file[gene_name]
        if key == 'nonzero':
            gene = Serialize.read_positions.build_gene(group, specific_keys={'all', '0'})
            # NOTE(review): loaded under key '0' but read back with int 0 --
            # presumably build_gene normalizes keys; confirm.
            yield gene_name, gene['all'] - gene[0]
        else:
            gene = Serialize.read_positions.build_gene(group, specific_keys={str(key)})
            yield gene_name, gene[key]
def plot_mRNA_metagene_unaveraged(from_end, min_length, max_length):
    """Plot unaveraged metagene read-count profiles for mRNA experiments.

    For each enabled (name, landmark, key, experiment) tuple, accumulates
    actual per-position counts across genes with
    min_length <= CDS_length <= max_length, alongside the counts expected
    if each gene's reads were spread uniformly over its positions, and
    plots both on a shared axis.

    from_end -- if truthy, align genes at their 3' end (uses the
                three_prime_read_positions file); otherwise align at the
                start of the CDS (uses read_positions with key='all').
    """
    bmap = brewer2mpl.get_map('Set1', 'qualitative', 9)
    # Color 5 of Set1 is skipped -- presumably for plot legibility; confirm.
    colors = cycle(bmap.mpl_colors[:5] + bmap.mpl_colors[6:])
    experiments = select_work.build_all_experiments(verbose=False)
    # (name, landmark, key, experiment) tuples; alternative combinations
    # are kept commented out for easy re-enabling.
    mRNA_experiments = [#('WT_mRNA_1', 'polyA', 0, experiments['belgium_2014_12_10']['WT_1_mRNA']),
                        #('WT_mRNA_1', 'polyA', 'nonzero', experiments['belgium_2014_12_10']['WT_1_mRNA']),
                        #('WT_mRNA_1', 'stop_codon', 0, experiments['belgium_2014_12_10']['WT_1_mRNA']),
                        #('WT_mRNA_1', 'stop_codon', 'nonzero', experiments['belgium_2014_12_10']['WT_1_mRNA']),
                        #('WT_mRNA_1', 'start', 'all', experiments['belgium_2014_12_10']['WT_1_mRNA']),
                        #('WT_mRNA_1', 'start_codon', 'all', experiments['belgium_2014_12_10']['WT_1_mRNA']),
                        #('WT_cDNA_mRNA', 'cap', 'all', experiments['belgium_2013_08_06']['WT_cDNA_mRNA']),
                        #('WT_cDNA_mRNA', 'start_codon', 'all', experiments['belgium_2013_08_06']['WT_cDNA_mRNA']),
                        #('R98S_1_mRNA', 'cap', 'all', experiments['belgium_2014_12_10']['R98S_1_mRNA']),
                        #('R98S_1_mRNA', 'start_codon', 'all', experiments['belgium_2014_12_10']['R98S_1_mRNA']),
                        ##('WT_mRNA_1 3\'', experiments['belgium_2014_12_10']['WT_1_mRNA']),
                        ##('WT_mRNA_2 3\'', experiments['belgium_2014_12_10']['WT_2_mRNA']),
                        ##('WT_cDNA_mRNA 3\'', experiments['belgium_2013_08_06']['WT_cDNA_mRNA']),
                        #('RiboZero', 'polyA', 0, experiments['weinberg']['RiboZero']),
                        #('RiboZero', 'polyA', 'nonzero', experiments['weinberg']['RiboZero']),
                        ('RiboZero', 'start', 'all', experiments['weinberg']['RiboZero']),
                        ('RiboZero', 'start_codon', 'all', experiments['weinberg']['RiboZero']),
                        ##('RiboZero', 'stop_codon', 0, experiments['weinberg']['RiboZero']),
                        ##('RiboZero', 'stop_codon', 'nonzero', experiments['weinberg']['RiboZero']),
                        #('Dynabeads', 'polyA', 0, experiments['weinberg']['Dynabeads']),
                        #('Dynabeads', 'polyA', 'nonzero', experiments['weinberg']['Dynabeads']),
                        #('Dynabeads', 'cap', 'all', experiments['weinberg']['Dynabeads']),
                        #('Dynabeads', 'start_codon', 'all', experiments['weinberg']['Dynabeads']),
                        ##('Dynabeads', 'stop_codon', 0, experiments['weinberg']['Dynabeads']),
                        ##('Dynabeads', 'stop_codon', 'nonzero', experiments['weinberg']['Dynabeads']),
                       ]
    plot_to = 500  # number of positions shown on the plot
    fig_cumulative, ax_cumulative = plt.subplots()
    edge_buffer = 200  # positions included beyond the landmark
    if from_end:
        xs = np.arange(-plot_to, edge_buffer)
    else:
        xs = np.arange(-edge_buffer, plot_to)
    # Per-gene total of reads falling upstream of the landmark.
    unexpected_counts = {}
    for (name, landmark, key, experiment), color in zip(mRNA_experiments, colors):
        print name, landmark, key
        if from_end:
            counts_generator = counts_from_read_positions_fn(experiment.file_names['three_prime_read_positions'], key=key)
        else:
            counts_generator = counts_from_read_positions_fn(experiment.file_names['read_positions'], key='all')
        # 90000 appears to be a sentinel larger than any real CDS length --
        # TODO confirm against positions.PositionCounts semantics.
        landmarks = {'start': 0, 'start_codon': 0, 'stop_codon': 90000, 'end': 90000}
        expected_counts = positions.PositionCounts(landmarks, 400, 400, dtype=float)
        actual_counts = positions.PositionCounts(landmarks, 400, 400, dtype=float)
        for gene_name, counts in counts_generator:
            if not min_length <= counts.CDS_length <= max_length:
                continue
            num_positions = counts.CDS_length + edge_buffer
            if from_end:
                edge_slice = (landmark, slice(-counts.CDS_length, edge_buffer))
            else:
                edge_slice = (landmark, slice(-edge_buffer, counts.CDS_length))
            # Positions strictly upstream of the landmark.
            unexpected_slice = (landmark, slice(-edge_buffer, 0))
            r_g = counts[edge_slice].sum()
            # Expected profile if this gene's r_g reads were uniform.
            uniform_counts = np.ones(num_positions) * r_g / num_positions
            actual_counts[edge_slice] += counts[edge_slice]
            expected_counts[edge_slice] += uniform_counts
            unexpected_counts[gene_name] = counts[unexpected_slice].sum()
        print actual_counts.sum()
        print expected_counts.sum()
        # Report the ten genes with the most upstream ("unexpected") reads.
        most_unexpected = sorted(unexpected_counts, key=unexpected_counts.get, reverse=True)
        for n in most_unexpected[:10]:
            print n, unexpected_counts[n]
        if from_end:
            plot_slice = (landmark, slice(-plot_to, edge_buffer))
        else:
            plot_slice = ('start_codon', slice(-edge_buffer, plot_to))
        # Dashed line: uniform expectation; dotted markers: observed counts.
        ax_cumulative.plot(xs, expected_counts[plot_slice], '--', color=color)
        ax_cumulative.plot(xs, actual_counts[plot_slice], 'o-', color=color, markersize=2, markeredgewidth=0, label='{0}, {1}, {2}, actual'.format(name, landmark, key))
        #ax_cumulative.plot(xs, smoothed(actual_counts[-49:plot_to], 15) / expected_counts[0], '-', label='{0}'.format(name), color=color)
        #ax_cumulative.set_ylim(0.8, 1.5)
        #ax_cumulative.plot(xs, np.zeros(plot_to), 'k--')
    ax_cumulative.legend(loc='upper left', framealpha=0.5)
    if from_end:
        # NOTE(review): landmark here is the value left over from the last
        # loop iteration.
        xlabel = 'Position relative to {0}'.format(landmark)
    else:
        xlabel = 'Position relative to start of CDS'
    ax_cumulative.set_xlabel(xlabel)
    ax_cumulative.set_xlim(min(xs), max(xs))
    ax_cumulative.set_ylabel('Mapped read counts, normalized across data sets')
    #ax_cumulative.set_title('Read counts in the final {0} bases of CDSs at least {0} long'.format(min_length))
    fig_cumulative.set_size_inches(18, 12)
    #plt.savefig('mRNA_bias_comparison_4.png')
    #plt.savefig('mRNA_bias_comparison.pdf')
def end_bias(from_end, min_length, max_length):
    """Measure excess reads near one end of each gene vs. a uniform model.

    Returns (excess_list, bigs): per-gene excess of observed over uniform
    counts summed over the first 200 positions (3'-first when from_end is
    truthy), and the full counts arrays of genes whose excess exceeds 500.
    Genes with extent_length < min_length are skipped.
    """
    # Generators that yields arrays of counts
    def counts_from_read_positions_fn(read_positions_fn, from_end):
        # Yield (gene_name, counts); counts are end-relative when from_end.
        gene_infos = Serialize.read_file(read_positions_fn, 'read_positions')
        for gene_name in gene_infos:
            #if gene_name == 'YLR256W':
            #    continue
            if from_end:
                counts = gene_infos[gene_name]['all'].relative_to_end
            else:
                counts = gene_infos[gene_name]['all']
            yield gene_name, counts
    experiment = ('Weinberg_mRNA', '/home/jah/projects/arlen/experiments/weinberg/mRNA/results/mRNA_read_positions.txt')
    #experiment = ('test', '/home/jah/projects/arlen/experiments/weinberg/mRNA/results/test_read_positions.txt')
    plot_to = max_length
    # NOTE(review): xs is computed but never used below.
    xs = np.arange(-49, plot_to)
    if from_end:
        xs = -xs
    name, fn = experiment
    excess_list = []
    bigs = []
    for gene_name, counts in counts_from_read_positions_fn(fn, from_end):
        if counts.extent_length < min_length:
            continue
        num_positions = counts.extent_length
        # NOTE(review): edge_slice is unused.
        edge_slice = slice(0, counts.extent_length)
        r_g = counts[:counts.extent_length].sum()
        # Expected per-position counts if the gene's reads were uniform.
        uniform_counts = np.ones(num_positions) * r_g / num_positions
        # Observed minus expected over the first 200 positions.
        excess = (counts[:200] - uniform_counts[:200]).sum()
        excess_list.append(excess)
        if excess > 500:
            print gene_name, excess
            bigs.append(counts[:counts.extent_length])
    return excess_list, bigs
def make_L_distribution(l_g, p):
    """Truncated geometric distribution over lengths 0..l_g.

    P(L = l) = (1 - p)**l * p for l < l_g, with the remaining tail mass
    (1 - p)**l_g lumped into the final bin, so the values sum to 1.
    """
    lengths = np.arange(l_g + 1)
    distribution = p * (1 - p) ** lengths
    # Assign all of the tail mass (lengths >= l_g) to the last bin.
    distribution[-1] = (1 - p) ** l_g
    return distribution
def make_P_distribution(l_g, p):
    """Expected per-position density under the geometric length model.

    Returns an array where entry x (for x >= 1) is the sum over l >= x of
    L(l) / l, with L the truncated geometric length distribution
    renormalized to exclude length 0.  Entry 0 stays 0.
    """
    lengths = np.arange(l_g + 1)
    per_position = np.zeros(l_g + 1)
    # A fragment of length l contributes 1/l to each of its l positions.
    per_position[1:] = 1. / lengths[1:]
    length_distribution = make_L_distribution(l_g, p)
    # Renormalize so that lengths 1..l_g sum to 1 (length 0 excluded).
    length_distribution = length_distribution / length_distribution[1:].sum()
    terms = per_position * length_distribution
    # Suffix-sum positions 1..l_g, accumulating from the far end -- the
    # same summation order as an explicit descending loop.
    terms[1:] = np.cumsum(terms[:0:-1])[::-1]
    return terms
# Number of flanking positions serialized on each side of a CDS in the
# position_counts arrays; counts_from_genes slices past 2 * edge_overlap
# of them to reach the CDS start.
edge_overlap = 50
def counts_from_genes(genes):
    """Yield each gene's CDS-restricted counts, summed over read lengths.

    Genes are visited in sorted-name order; the 2 * edge_overlap leading
    flank positions are skipped to align index 0 with the CDS start.
    """
    for gene_name in sorted(genes):
        gene = genes[gene_name]
        start = 2 * edge_overlap
        cds = slice(start, start + gene['CDS_length'])
        by_length = gene['position_counts']
        yield sum(by_length[length][cds] for length in by_length)
def get_actual_counts(genes, min_length):
    """Sum observed per-position counts across genes, aligned at the end.

    Each gene's counts are reversed so index 0 is the final CDS position;
    genes shorter than min_length are skipped.  Returns a fixed-size
    accumulator array of length 100000.
    """
    totals = np.zeros(100000)
    for counts in counts_from_genes(genes):
        gene_length = len(counts)
        if gene_length < min_length:
            continue
        # Align at the 3' end: position 0 = last base of the CDS.
        totals[:gene_length] += counts[::-1]
    return totals
def get_geometric_counts(genes, p, min_length):
    """Sum model-expected per-position counts across genes, end-aligned.

    For each gene (skipping those shorter than min_length), scales the
    geometric-model position distribution by the gene's total read count
    and accumulates it into a fixed-size (100000) array, matching the
    alignment used by get_actual_counts.
    """
    totals = np.zeros(100000)
    for counts in counts_from_genes(genes):
        gene_length = len(counts)
        if gene_length < min_length:
            continue
        total_reads = counts.sum()
        # Some weirdness about whether to include 0 or not in
        # make_P_distribution -- position 0 is dropped here.
        expected = make_P_distribution(gene_length, p)[1:] * total_reads
        totals[:gene_length] += expected
    return totals
def residuals(p, genes, min_length):
    # Residual function for scipy.optimize.leastsq: observed end-aligned
    # counts minus the geometric-model prediction at decay parameter p.
    print 'testing', p
    err = get_actual_counts(genes, min_length) - get_geometric_counts(genes, p, min_length)
    return err
def fit_p():
    """Fit the geometric decay parameter p by least squares against the
    observed end-aligned counts; returns scipy's full leastsq output."""
    read_positions_fn = '/home/jah/projects/arlen/experiments/ingolia_science/mRNA-rich-1/results/mRNA-rich-1_read_positions.txt'
    #read_positions_fn = '/home/jah/projects/arlen/experiments/ingolia_science/mRNA-rich-2/results/mRNA-rich-2_read_positions.txt'
    #read_positions_fn = '/home/jah/projects/arlen/experiments/nagalakshmi_science/RH_ori/results/RH_ori_read_positions.txt'
    genes = Serialize.read_file(read_positions_fn, 'read_positions')
    # Genes shorter than this are excluded from the fit (see residuals).
    min_length = 5000
    # 7.8e-5 is the initial guess for p.
    p_lsq = leastsq(residuals, 7.8e-5, args=(genes, min_length), full_output=True)
    print p_lsq
    return p_lsq
def total_counts_given_p():
    """Plot observed end-aligned counts against the uniform and geometric
    models (with a fixed p) for the enabled mRNA experiments."""
    # (name, read_positions file) pairs; alternatives kept commented out.
    mRNA_experiments = [
        ('Ingolia_mRNA_1', '/home/jah/projects/arlen/experiments/ingolia_science/mRNA-rich-1/results/mRNA-rich-1_read_positions.txt'),
        #('Ingolia_mRNA_2', '/home/jah/projects/arlen/experiments/ingolia_science/mRNA-rich-2/results/mRNA-rich-2_read_positions.txt'),
        #('Nagalakshmi_RH_ori', '/home/jah/projects/arlen/experiments/nagalakshmi_science/RH_ori/results/RH_ori_read_positions.txt'),
        #('Nagalakshmi_RH_bio', '/home/jah/projects/arlen/experiments/nagalakshmi_science/RH_bio/results/RH_bio_read_positions.txt'),
        #('Nagalakshmi_dT_ori', '/home/jah/projects/arlen/experiments/nagalakshmi_science/dT_ori/results/dT_ori_read_positions.txt'),
        #('Nagalakshmi_dT_bio', '/home/jah/projects/arlen/experiments/nagalakshmi_science/dT_bio/results/dT_bio_read_positions.txt'),
    ]
    experiments = [(name, counts_from_genes(Serialize.read_file(fn, 'read_positions')))
                   for name, fn in mRNA_experiments]
    # NOTE(review): 0000 is just 0 -- genes of any length pass min_length.
    min_length = 0000
    max_length = 10000
    plot_to = 2000  # number of positions (from the end) shown on the plot
    fig_cumulative, ax_cumulative = plt.subplots()
    # x-axis runs 0, -1, ..., -(plot_to - 1): positions relative to the end.
    xs = np.arange(0, -plot_to, -1)
    for name, counts_generator in experiments:
        print name
        total_uniform_counts = np.zeros(100000)
        total_geometric_counts = np.zeros(100000)
        total_actual_counts = np.zeros(100000)
        for counts in counts_generator:
            l_g = len(counts)
            if not (min_length < l_g < max_length):
                continue
            r_g = counts.sum()
            # Uniform model: r_g reads spread evenly over l_g positions.
            uniform_counts = np.ones(l_g) / l_g * r_g
            # Some weirdness about whether to include 0 or not in make_P_distribution
            #geometric_counts = make_P_distribution(l_g, 4e-4)[1:] * r_g
            # 7.8e-5 is presumably the value fit by fit_p -- confirm.
            geometric_counts = make_P_distribution(l_g, 7.8e-5)[1:] * r_g
            #geometric_counts = make_P_distribution(l_g, 4.6e-5)[1:] * r_g
            # Reverse so position 0 is the end of the CDS.
            counts = counts[::-1]
            total_actual_counts[:l_g] += counts
            total_uniform_counts[:l_g] += uniform_counts
            total_geometric_counts[:l_g] += geometric_counts
        print total_actual_counts.sum()
        print total_uniform_counts.sum()
        print total_geometric_counts.sum()
        ax_cumulative.plot(xs, smoothed(total_actual_counts[:plot_to], 5), '-', label=name + '_actual_smoothed')
        ax_cumulative.plot(xs, total_uniform_counts[:plot_to], '-', linewidth=2, label=name + '_uniform')
        ax_cumulative.plot(xs, total_geometric_counts[:plot_to], '-', linewidth=2, label=name + '_geometric')
    ax_cumulative.legend()
    xlabel = 'Position relative to end'
    ax_cumulative.set_xlabel(xlabel)
    ax_cumulative.set_ylabel('Mapped read counts')
    #ax_cumulative.set_title('Read counts in the final {0} bases of CDSs at least {0} long'.format(min_length))
#if __name__ == '__main__':
# read_positions_fn = '/home/jah/projects/arlen/experiments/ingolia_science/mRNA-rich-1/results/mRNA-rich-1_read_positions.txt'
# genes = Serialize.read_file(read_positions_fn, 'read_positions')
# tail_counts = {}
# for n in [100]:
# tail_counts[n] = [sum(counts[-n:]) for counts in counts_from_genes(genes)
# if sum(counts) > 100 and len(counts) > 100 and sum(counts[-n:]) > 0]
# read_density = [sum(counts) / float(len(counts)) for counts in counts_from_genes(genes)
# if sum(counts) > 100 and len(counts) > 100 and sum(counts[-n:]) > 0]
#
# ratios = [t / r_d for t, r_d in zip(tail_counts[100], read_density)]
# lengths = [len(counts) for counts in counts_from_genes(genes)
# if sum(counts) > 100 and len(counts) > 100 and sum(counts[-n:]) > 0]
#
# log_lengths = np.log10(lengths)
# log_ratios = np.log10(ratios)