forked from argriffing/xgcode
/
20090821a.py
312 lines (289 loc) · 12.4 KB
/
20090821a.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
"""Annotate a chromosome with MAP states for various prior autocorrelations.
This should use HMMs with various levels of 'stickiness' or 'autocorrelation'.
Technically, the transition matrix at each state has probability (1-2x/3) of
no transition in one step, and a probability of (x/3)
of each of the two possible transitions,
where x=(1/10)**alpha where alpha is the 'stickiness'
which is a nonnegative integer.
A stickiness of zero means that the position specific posterior distributions
do not reflect any prior belief of sequential dependence among hidden states.
Use a HMM with 3 hidden states.
A subset of genomic positions are annotated
with a posterior hidden state distribution
given observations at each position in the subset.
The 3x3 transition matrix is known,
and the emission distributions for each hidden state are known
but are somewhat complicated.
Columns of the output are defined as follows.
strain: a string defining the genetic line
chromosome: a string defining the name of the chromosome
offset: an integer offset in units of base pairs
A: the number of A reads aligned to this position
C: the number of C reads aligned to this position
G: the number of G reads aligned to this position
T: the number of T reads aligned to this position
gap: the number of gap reads aligned to this position
call_0: MAP state for no prior stickiness
hom_0: relative posterior belief of homozygosity
het_0: relative posterior belief of heterozygosity
bad_0: relative posterior belief of badness
call_1: MAP state for prior stickiness of 1
hom_1: relative posterior belief of homozygosity
het_1: relative posterior belief of heterozygosity
bad_1: relative posterior belief of badness
.
.
.
"""
from StringIO import StringIO
import time
import optparse
import sys
import numpy as np
from SnippetUtil import HandlingError
import Form
import FormOut
import Progress
import TransitionMatrix
import ReadCoverage
import FastHMM
import iterutils
import Util
import const
# Sample input used as the default value of the 'input_text' form field;
# presumably const.read loads stored data by its identifier -- TODO confirm.
g_sample_data = const.read('20100730j')
class TimeoutError(Exception):
    """Raised by process() when the allowed run time has been exceeded."""
def get_form():
    """
    Describe the inputs accepted by the web interface.
    Each entry gives a field name, a description, a default value,
    and (for the numeric fields) the allowed bounds.
    @return: the body of a form
    """
    # expected coverage for the informative states
    good_coverage_field = Form.Integer(
            'good_coverage',
            'expected read coverage of informative positions',
            20, low=1, high=1000)
    # expected coverage for the overcovered state
    bad_coverage_field = Form.Integer(
            'bad_coverage',
            'expected read coverage of overcovered positions',
            100, low=1, high=1000)
    # how many stickiness levels to annotate with
    nstickinesses_field = Form.Integer(
            'nstickinesses',
            'use this many different levels of stickiness',
            5, low=1, high=5)
    # per base call error probability
    randomization_rate_field = Form.Float(
            'randomization_rate',
            'randomization probability per base call',
            0.1, low_exclusive=0)
    # the csv data itself, defaulting to the module level sample data
    input_text_field = Form.MultiLine(
            'input_text',
            'calls per nt per base call per chromosome per strain',
            g_sample_data)
    return [
            good_coverage_field,
            bad_coverage_field,
            nstickinesses_field,
            randomization_rate_field,
            input_text_field]
def get_form_out():
    """
    @return: a FormOut.Csv output descriptor labeled 'annotation'
    """
    csv_output = FormOut.Csv('annotation')
    return csv_output
def get_response_content(fs):
    """
    Compute the csv annotation for a web request.
    @param fs: an object whose attributes hold the submitted form values
    @return: the annotation text followed by a newline
    @raise HandlingError: when the run time limit is exceeded
    """
    # web access gets only two seconds and no progress bar
    time_limit_seconds = 2
    show_progress = False
    # extract the stripped lines of the multi-line input
    stripped_lines = Util.get_stripped_lines(fs.input_text.splitlines())
    # translate a timeout into an error the caller knows how to report
    try:
        annotation_text = process(
                stripped_lines,
                fs.good_coverage,
                fs.bad_coverage,
                fs.randomization_rate,
                fs.nstickinesses,
                time_limit_seconds,
                show_progress)
    except TimeoutError:
        raise HandlingError('exceeded remote run time limit')
    return annotation_text + '\n'
class Chromosome:
    """
    One of these exists for each chromosome in each strain.
    This is the level at which the HMM applies.
    Positions are accumulated with add_position,
    annotate_posteriors is then called once per stickiness level,
    and get_rows_of_strings formats the positions and annotations as rows.
    """

    def __init__(self, strain_name, chromosome_name):
        """
        @param strain_name: the name of the associated genetic strain
        @param chromosome_name: the name of the chromosome
        @raise ValueError: if the two names cannot be joined as strings
        """
        # Joining the names is a cheap check that both behave like strings.
        # A non-string element makes str.join raise TypeError,
        # so that exception must be translated as well as ValueError.
        try:
            ','.join([strain_name, chromosome_name])
        except (TypeError, ValueError) as e:
            raise ValueError('value error for names %s and %s: %s' % (strain_name, chromosome_name, e))
        # initialize some states
        self.strain_name = strain_name
        self.chromosome_name = chromosome_name
        # parallel lists with one entry per added position
        self.offsets = []
        self.nt_coverages = []
        # parallel lists with one entry per annotate_posteriors call
        self.posterior_distribution_lists = []
        self.stickinesses = []

    def add_position(self, offset, coverages):
        """
        Add a position to the chromosome.
        @param offset: the nucleotide offset
        @param coverages: the (A, C, G, T, gap) coverage counts
        @raise ValueError: if there are not exactly five coverage counts
        """
        if len(coverages) != 5:
            raise ValueError('five coverage counts were expected')
        self.offsets.append(offset)
        self.nt_coverages.append(coverages)

    def annotate_posteriors(self, stickiness, hidden_models):
        """
        Compute and store position specific posterior distributions.
        @param stickiness: a nonnegative integer that defines the transition matrix
        @param hidden_models: a list of statistical models
        """
        # define the transition matrix
        nhidden = len(hidden_models)
        prandom = .1**stickiness
        transition_object = TransitionMatrix.UniformTransitionObject(prandom, nhidden)
        # define the HMM
        cache_size = 10000
        hmm = FastHMM.Model(transition_object, hidden_models, cache_size)
        # each observation is the sorted counts with the last (gap) count dropped
        observations = [tuple(sorted(coverage[:-1])) for coverage in self.nt_coverages]
        # differences between the offsets paired up by iterutils.pairwise
        distances = [b - a for a, b in iterutils.pairwise(self.offsets)]
        # do the annotation
        dp_info = hmm.get_dp_info(observations, distances)
        distribution_list = hmm.scaled_posterior_durbin(dp_info)
        # store the annotation with its respective stickiness
        self.posterior_distribution_lists.append(distribution_list)
        self.stickinesses.append(stickiness)

    def get_rows_of_strings(self, model_names):
        """
        Format every position, with all stored annotations, as strings.
        @param model_names: a conformant list of strings for annotation
        @return: a list of lists of strings
        """
        rows = []
        for i, offset in enumerate(self.offsets):
            # begin with a fixed number of columns of non-annotation
            row = [self.strain_name, self.chromosome_name, str(offset)]
            row.extend(str(x) for x in self.nt_coverages[i])
            # append a variable number of columns of annotation;
            # the stickiness value itself is not written to the row
            for _stickiness, posterior_distributions in zip(self.stickinesses, self.posterior_distribution_lists):
                # identify the position specific posterior distribution
                distribution = posterior_distributions[i]
                # append the name of the MAP state defined by the distribution;
                # equal probabilities are resolved by comparing the names
                _p, name = max(zip(distribution, model_names))
                row.append(name)
                # append the distribution itself
                row.extend(str(x) for x in distribution)
            rows.append(row)
        return rows
def parse(input_lines):
    """
    Convert lines of csv input, header line first, into chromosome objects.
    @param input_lines: raw input lines of the csv file
    @return: a list of chromosome objects
    @raise ValueError: if there are too few lines or a row has the wrong width
    """
    ncolumns_expected = 8
    # a header line and at least one data line are required
    if len(input_lines) < 2:
        raise ValueError('there should be at least two lines of input')
    # break the lines of input into rows of elements
    input_rows = []
    for line in input_lines:
        input_rows.append(line_to_row(line))
    # the first row fixes the number of columns
    if len(input_rows[0]) != ncolumns_expected:
        raise ValueError('expected %d columns of input' % ncolumns_expected)
    # every row must be as wide as the first row
    if any(len(row) != ncolumns_expected for row in input_rows):
        raise ValueError('each row of input should have the same number of elements as the first row')
    # accumulate the positions of the non-header rows by chromosome
    chromosome_dict = {}
    for data_row in input_rows[1:]:
        process_genomic_position(data_row, chromosome_dict)
    # order the chromosomes by their (strain name, chromosome name) keys
    return [chromosome_dict[identifier] for identifier in sorted(chromosome_dict)]
def process_genomic_position(row, chromosome_dict):
    """
    Record the single genomic position described by one data row.
    The chromosome object is looked up by its (strain_name, chromosome_name)
    pair and is created and registered if it is not yet in the dictionary.
    @param row: a data row from the csv file
    @param chromosome_dict: a dictionary of chromosome objects
    """
    # the first two elements name the chromosome
    strain_name, chromosome_name = row[:2]
    # the third element is the position and the rest are integer counts
    genomic_position = int(row[2])
    coverages = []
    for element in row[3:]:
        coverages.append(int(element))
    # get or create the relevant chromosome object
    key = (strain_name, chromosome_name)
    if key not in chromosome_dict:
        chromosome_dict[key] = Chromosome(strain_name, chromosome_name)
    target = chromosome_dict[key]
    # add the observation to the chromosome
    target.add_position(genomic_position, coverages)
def line_to_row(line):
    """
    Split a csv line at its commas and trim whitespace from each element.
    @param line: a line in the csv file
    @return: a list of comma separated elements
    """
    row = []
    for element in line.split(','):
        row.append(element.strip())
    return row
def process(input_lines, good_coverage, bad_coverage, randomization_rate, nstickinesses, nseconds, use_pbar):
    """
    Annotate every chromosome at every stickiness level and format as csv.
    @param input_lines: lines of input of csv data including the header
    @param good_coverage: the expected number of reads at informative positions
    @param bad_coverage: the expected number of reads at uninformative positions
    @param randomization_rate: the probability of an error per read
    @param nstickinesses: use this many different levels of stickiness
    @param nseconds: None or impose a time limit of this many seconds
    @param use_pbar: True iff a progress bar should be used
    @return: the multi-line string of the resulting csv file
    @raise TimeoutError: if the time limit is exceeded before an annotation
    """
    # do some initialization
    out = StringIO()
    pbar = None
    start_time = time.time()
    # define the three models in the order matched by the model names
    models = [
            ReadCoverage.Homozygous(randomization_rate, good_coverage),
            ReadCoverage.Heterozygous(randomization_rate, good_coverage),
            ReadCoverage.Overcovered(randomization_rate, bad_coverage)]
    model_names = ['hom', 'het', 'ovr']
    # read the chromosome data
    chromosomes = parse(input_lines)
    # the header has eight fixed columns then four columns per stickiness
    header_row = [
            'genetic_line', 'chromosome', 'position',
            'A_count', 'C_count', 'G_count', 'T_count', 'gap_count']
    for stickiness in range(nstickinesses):
        header_row.extend(
                '%s_%d' % (name, stickiness)
                for name in ('call', 'hom', 'het', 'bad'))
    out.write(','.join(header_row) + '\n')
    # prepare to annotate the chromosomes
    nannotations = 0
    if use_pbar:
        pbar = Progress.Bar(len(chromosomes)*nstickinesses)
    # annotate the chromosomes using the models
    for chromosome in chromosomes:
        for stickiness in range(nstickinesses):
            # check the time limit before starting each annotation
            if nseconds and time.time() - start_time > nseconds:
                raise TimeoutError()
            chromosome.annotate_posteriors(stickiness, models)
            if pbar:
                nannotations += 1
                pbar.update(nannotations)
        # write the rows of a chromosome when all its annotations are done
        row_strings = [','.join(row) for row in chromosome.get_rows_of_strings(model_names)]
        out.write('\n'.join(row_strings) + '\n')
    if pbar:
        pbar.finish()
    # return the output text
    return out.getvalue().strip()
def main(options):
    """
    Annotate the csv data read from stdin and write the result to stdout.
    A progress bar is used and no time limit is imposed.
    @param options: parsed command line options with the attributes
    good_coverage, bad_coverage, randomization_rate, and nstickinesses
    @raise ValueError: if a coverage or the randomization rate is out of range
    """
    # Validate with explicit errors instead of assert statements,
    # because assertions are removed when python runs with -O.
    if not (1 <= options.good_coverage):
        raise ValueError('the good coverage should be at least 1')
    if not (1 <= options.bad_coverage):
        raise ValueError('the bad coverage should be at least 1')
    if not (0 < options.randomization_rate <= 1):
        raise ValueError('the randomization rate should be positive and at most 1')
    # read from standard input
    lines = Util.get_stripped_lines(sys.stdin)
    # compute the result
    use_pbar = True
    nseconds = None
    result = process(
            lines,
            options.good_coverage, options.bad_coverage,
            options.randomization_rate, options.nstickinesses,
            nseconds, use_pbar)
    # show the result followed by a newline, as a print statement would
    sys.stdout.write(result + '\n')
if __name__ == '__main__':
    from optparse import OptionParser
    # each entry is (flag, destination, type, default, help text)
    option_table = (
            ('--nstickinesses', 'nstickinesses', 'int', 5,
                'use this many different levels of stickiness'),
            ('--good-coverage', 'good_coverage', 'int', 20,
                'expected read coverage of informative positions'),
            ('--bad-coverage', 'bad_coverage', 'int', 100,
                'expected read coverage of overcovered positions'),
            ('--randomization-rate', 'randomization_rate', 'float', 0.1,
                'randomization probability per read'))
    parser = OptionParser()
    for flag, dest, option_type, default, help_text in option_table:
        parser.add_option(
                flag, dest=dest, type=option_type,
                default=default, help=help_text)
    # positional arguments are parsed but not used
    options, args = parser.parse_args()
    main(options)