/
20090820a.py
283 lines (256 loc) · 10.6 KB
/
20090820a.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
"""Annotate a single resequenced chromosome in a single genetic line.
Use a HMM with 3 hidden states.
A subset of genomic positions are annotated
with a posterior hidden state distribution
given observations at each position in the subset.
The 3x3 transition matrix is known,
and the emission distributions for each hidden state are known
but are somewhat complicated.
Columns of the output are defined as follows.
strain: a string defining the genetic line
chromosome: a string defining the name of the chromosome
offset: an integer offset in units of base pairs
A: an integer representing the number of A reads aligned to this position
C: an integer representing the number of C reads aligned to this position
G: an integer representing the number of G reads aligned to this position
T: an integer representing the number of T reads aligned to this position
gap: an integer representing the number of gap reads
aligned to this position
hom_ll: the floating point non-Markov log likelihood
that the position is homozygous
het_ll: the floating point non-Markov log likelihood
that the position is heterozygous
bad_ll: the floating point non-Markov log likelihood
that the position is bad
hom_post: the floating point posterior probability
that the position is homozygous
het_post: the floating point posterior probability
that the position is heterozygous
bad_post: the floating point posterior probability
that the position is bad
"""
from StringIO import StringIO
import time
import optparse
import sys
import numpy as np
from SnippetUtil import HandlingError
import Form
import FormOut
import Progress
import ReadCoverage
import FastHMM
import TransitionMatrix
import iterutils
class TimeoutError(Exception):
    """Raised when the annotation run exceeds its allotted number of seconds."""
# Column names written as the first line of the output csv, in output order:
# position identity, per-nucleotide read counts, per-state log likelihoods,
# and per-state posterior probabilities.
g_output_header_row = (
    ['genetic_line', 'chromosome', 'position'] +
    ['A_count', 'C_count', 'G_count', 'T_count', 'gap_count'] +
    ['homozygous_ll', 'heterozygous_ll', 'bad_ll'] +
    ['homozygous_posterior', 'heterozygous_posterior', 'bad_posterior']
)
def get_form():
    """
    This script takes no form input, so the form body is empty.
    @return: the body of a form, as a new empty list
    """
    form_objects = []
    return form_objects
def get_form_out():
    """
    @return: a new FormOut.Report object describing the output format
    """
    report = FormOut.Report()
    return report
def get_response_content(fs):
    """
    Report that this script cannot be run through the web interface.
    The annotation is only driven from the command line entry point,
    so no processing is attempted here and the argument is ignored.
    @param fs: a FieldStorage object containing the cgi arguments (unused)
    @return: a fixed single-line text message
    """
    return 'this is not web-accessible\n'
class Chromosome:
    """
    The positions observed on one chromosome of one genetic strain.
    One instance exists per (strain, chromosome) pair,
    and the HMM is applied separately to each instance.
    The offsets, nt_coverages, log_likelihoods and posterior_distributions
    lists are indexed in parallel by position.
    """

    def __init__(self, strain_name, chromosome_name):
        """
        @param strain_name: the name of the associated genetic strain
        @param chromosome_name: the name of the chromosome
        """
        # Joining the names is only a sanity check; the result is discarded.
        try:
            ','.join([strain_name, chromosome_name])
        except ValueError as e:
            raise ValueError('value error for names %s and %s: %s' % (strain_name, chromosome_name, e))
        # identity of this chromosome
        self.strain_name, self.chromosome_name = strain_name, chromosome_name
        # per-position inputs, filled in by add_position
        self.offsets = []
        self.nt_coverages = []
        # per-position annotations, filled in by the annotate_* methods
        self.log_likelihoods = []
        self.posterior_distributions = []

    def add_position(self, offset, coverages):
        """
        Append one observed position to the chromosome.
        @param offset: the nucleotide offset
        @param coverages: the (A, C, G, T, gap) coverage counts
        @raise ValueError: if there are not exactly five coverage counts
        """
        if len(coverages) == 5:
            self.offsets.append(offset)
            self.nt_coverages.append(coverages)
        else:
            raise ValueError('five coverage counts were expected')

    def annotate_likelihoods(self, hidden_models):
        """
        Compute, for each position, the log likelihood under each model.
        The trailing gap count is not part of the observation.
        @param hidden_models: a list of statistical models
        """
        table = []
        for coverage in self.nt_coverages:
            observation = coverage[:-1]
            table.append([model.get_log_likelihood(observation) for model in hidden_models])
        self.log_likelihoods = table

    def annotate_posteriors(self, transition_object, hidden_models):
        """
        Compute the posterior hidden state distribution at each position.
        @param transition_object: has transition matrix information
        @param hidden_models: a list of statistical models
        """
        # build the HMM
        cache_size = 10000
        hmm = FastHMM.Model(transition_object, hidden_models, cache_size)
        # Each observation is the sorted tuple of the four nucleotide counts
        # (gap count dropped); presumably the hashable canonical form is what
        # the HMM cache keys on -- confirm against FastHMM.
        observations = []
        for coverage in self.nt_coverages:
            nucleotide_counts = coverage[:-1]
            observations.append(tuple(sorted(nucleotide_counts)))
        # distances are the gaps between consecutive offsets
        distances = [right - left for left, right in iterutils.pairwise(self.offsets)]
        # run the dynamic programming and keep the posteriors
        dp_info = hmm.get_dp_info(observations, distances)
        self.posterior_distributions = hmm.scaled_posterior_durbin(dp_info)

    def get_rows_of_strings(self):
        """
        Format every position as one output row: the strain name, the
        chromosome name, the offset, the coverage counts, the log
        likelihoods, and the posterior probabilities.
        @return: a list of lists of strings
        """
        rows = []
        for i, offset in enumerate(self.offsets):
            identity = [self.strain_name, self.chromosome_name, str(offset)]
            counts = [str(value) for value in self.nt_coverages[i]]
            likelihoods = [str(value) for value in self.log_likelihoods[i]]
            posteriors = [str(value) for value in self.posterior_distributions[i]]
            rows.append(identity + counts + likelihoods + posteriors)
        return rows
def parse(input_lines):
    """
    Convert the lines of the csv file into chromosome objects.
    The first line is treated as a header and contributes no position.
    @param input_lines: raw input lines of the csv file
    @return: a list of chromosome objects,
    ordered by their (strain name, chromosome name) pair
    @raise ValueError: if there are fewer than two lines, if the first row
    does not have eight columns, or if the rows differ in length
    """
    # a header and at least one data line are required
    if len(input_lines) < 2:
        raise ValueError('there should be at least two lines of input')
    # split every line into its comma separated elements
    input_rows = []
    for line in input_lines:
        input_rows.append(line_to_row(line))
    # the header fixes the number of columns for every row
    ncolumns_expected = 8
    header_width = len(input_rows[0])
    if header_width != ncolumns_expected:
        raise ValueError('expected %d columns of input' % ncolumns_expected)
    if any(len(row) != header_width for row in input_rows):
        raise ValueError('each row of input should have the same number of elements as the first row')
    # group the data rows by chromosome
    chromosome_dict = {}
    for data_row in input_rows[1:]:
        process_genomic_position(data_row, chromosome_dict)
    # the identifiers are unique, so sorting the keys gives the same
    # order as sorting the (identifier, chromosome) pairs
    return [chromosome_dict[identifier] for identifier in sorted(chromosome_dict)]
def process_genomic_position(row, chromosome_dict):
    """
    Record a single genomic position on the chromosome it belongs to.
    The chromosome object is looked up by its
    (strain_name, chromosome_name) pair and is created
    and stored in the dictionary the first time the pair is seen.
    @param row: a data row from the csv file
    @param chromosome_dict: a dictionary of chromosome objects
    """
    # the first two elements name the chromosome
    strain_name, chromosome_name = row[:2]
    # the third element is the position and the rest are coverage counts
    genomic_position = int(row[2])
    coverages = []
    for element in row[3:]:
        coverages.append(int(element))
    # find the chromosome, creating it on first sight
    identifier = (strain_name, chromosome_name)
    if identifier not in chromosome_dict:
        chromosome_dict[identifier] = Chromosome(strain_name, chromosome_name)
    chromosome = chromosome_dict[identifier]
    # record the observation
    chromosome.add_position(genomic_position, coverages)
def line_to_row(line):
    """
    Split a csv line on commas and trim whitespace around each element.
    No quoting or escaping is interpreted.
    @param line: a line in the csv file
    @return: a list of comma separated elements
    """
    elements = []
    for raw_element in line.split(','):
        elements.append(raw_element.strip())
    return elements
def process(input_lines, good_coverage, bad_coverage, randomization_rate, transition_object, nseconds, use_pbar):
    """
    Annotate every chromosome found in the input and format the result as csv.
    The time limit is checked once before each chromosome is annotated,
    so a single chromosome is never interrupted part way through.
    @param input_lines: lines of input of csv data including the header
    @param good_coverage: the expected number of reads at informative positions
    @param bad_coverage: the expected number of reads at uninformative positions
    @param randomization_rate: the probability of an error per read
    @param transition_object: has transition matrix information
    @param nseconds: None or impose a time limit of this many seconds
    @param use_pbar: True iff a progress bar should be used
    @return: the multi-line string of the resulting csv file
    @raise TimeoutError: if the time limit is exceeded
    """
    # do some initialization
    out = StringIO()
    pbar = None
    start_time = time.time()
    # define the three models, in the column order of the output
    models = [
        ReadCoverage.Homozygous(randomization_rate, good_coverage),
        ReadCoverage.Heterozygous(randomization_rate, good_coverage),
        ReadCoverage.Overcovered(randomization_rate, bad_coverage),
    ]
    # read the chromosome data
    chromosomes = parse(input_lines)
    # write the header line
    out.write(','.join(g_output_header_row) + '\n')
    # prepare to annotate the chromosomes
    if use_pbar:
        pbar = Progress.Bar(len(chromosomes))
    # annotate the chromosomes using the models
    try:
        for i, chromosome in enumerate(chromosomes):
            if nseconds and time.time() - start_time > nseconds:
                raise TimeoutError()
            chromosome.annotate_likelihoods(models)
            chromosome.annotate_posteriors(transition_object, models)
            rows = chromosome.get_rows_of_strings()
            out.write('\n'.join(','.join(row) for row in rows) + '\n')
            if pbar:
                pbar.update(i + 1)
    except (KeyboardInterrupt, TimeoutError):
        # close the progress bar before propagating; the bare raise
        # keeps the original traceback intact
        if pbar:
            pbar.finish()
        raise
    # return the output text without the trailing newline
    return out.getvalue().strip()
def main(options):
    """
    Annotate the csv data read from standard input and print the result.
    Blank lines of input are ignored.
    @param options: parsed command line options providing the
    good_coverage, bad_coverage, and randomization_rate attributes
    @raise ValueError: if an option is outside its valid range
    """
    # validate the options explicitly so that the checks survive python -O
    if not (1 <= options.good_coverage):
        raise ValueError('the good coverage should be at least 1')
    if not (1 <= options.bad_coverage):
        raise ValueError('the bad coverage should be at least 1')
    if not (0 < options.randomization_rate <= 1):
        raise ValueError('the randomization rate should be greater than 0 and at most 1')
    # define an arbitrary transition matrix
    transition_object = TransitionMatrix.UniformTransitionObject(.01, 3)
    # read from standard input, dropping surrounding whitespace and blank lines
    lines = sys.stdin.readlines()
    lines = [line.strip() for line in lines]
    lines = [line for line in lines if line]
    # show the result, with a progress bar and no time limit
    use_pbar = True
    nseconds = None
    print(process(lines, options.good_coverage, options.bad_coverage, options.randomization_rate, transition_object, nseconds, use_pbar))
if __name__ == '__main__':
    # Command line entry point: register the options, parse, and run.
    from optparse import OptionParser
    parser = OptionParser()
    # (flag, destination attribute, value type, default, help text)
    option_specs = (
        ('--good-coverage', 'good_coverage', 'int', 20, 'expected read coverage of informative positions'),
        ('--bad-coverage', 'bad_coverage', 'int', 100, 'expected read coverage of overcovered positions'),
        ('--randomization-rate', 'randomization_rate', 'float', 0.1, 'randomization probability per read'),
    )
    for flag, dest, value_type, default, help_text in option_specs:
        parser.add_option(flag, dest=dest, type=value_type, default=default, help=help_text)
    options, args = parser.parse_args()
    main(options)