forked from argriffing/xgcode
/
F84.py
234 lines (212 loc) · 8.67 KB
/
F84.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
"""Functions dealing with the F84 nucleotide rate matrix.
The F84 evolutionary model is defined in the paper
"Maximum Likelihood Phylogenetic Estimation from DNA Sequences with Variable Rates over Sites: Approximate Methods"
by Ziheng Yang in J Mol Evol 1994.
"""
from StringIO import StringIO
import unittest
import math
import scipy.optimize
import MatrixUtil
import RateMatrix
import Fasta
import PairLikelihood
g_transitions = (('C', 'T'), ('T', 'C'), ('A', 'G'), ('G', 'A'))
def create_rate_matrix(kappa, nt_distribution):
"""
@param kappa: adjusts for the transition rate differing from the transversion rate
@param nt_distribution: ordered ACGT nucleotide probabilities
@return: a rate matrix object with one expected nucleotide substitution per time unit
"""
# make some assertions about the distribution
for p in nt_distribution:
assert p >= 0
assert len(nt_distribution) == 4
assert RateMatrix.almost_equal(sum(nt_distribution), 1.0)
# define some intermediate variables
A, C, G, T = nt_distribution
R = float(A + G)
Y = float(C + T)
# make some more assertions about the distribution and about kappa
assert A+G > 0
assert C+T > 0
assert kappa > max(-Y, -R)
# get the normalization constant
normalization_constant = 4*T*C*(1 + kappa/Y) + 4*A*G*(1 + kappa/R) + 4*Y*R
# adjust the normalization constant to correct what might be an error in the paper
normalization_constant /= 2
# define the dictionary rate matrix
dict_rate_matrix = {}
for source_index, source in enumerate('ACGT'):
for sink_index, sink in enumerate('ACGT'):
key = (source, sink)
coefficient = 1.0
if key in g_transitions:
coefficient = 1 + kappa / (nt_distribution[source_index] + nt_distribution[sink_index])
dict_rate_matrix[key] = coefficient * nt_distribution[sink_index] / normalization_constant
for source in 'ACGT':
dict_rate_matrix[(source, source)] = -sum(dict_rate_matrix[(source, sink)] for sink in 'ACGT' if source != sink)
# convert the dictionary rate matrix to a row major rate matrix
row_major = MatrixUtil.dict_to_row_major(dict_rate_matrix, 'ACGT', 'ACGT')
# return the rate matrix object
rate_matrix_object = RateMatrix.RateMatrix(row_major, 'ACGT')
expected_rate = rate_matrix_object.get_expected_rate()
if not RateMatrix.almost_equal(expected_rate, 1.0):
assert False, 'the rate is %f but should be 1.0' % expected_rate
return rate_matrix_object
def get_closed_form_estimates(sequence_pair):
"""
This will fail if some of the nucleotides are unobserved.
This formula is from Ziheng Yang JME 1994.
@param sequence_pair: a pair of upper case nucleotide sequences
@return: (distance, kappa, A, C, G, T)
"""
# assert that the input is reasonable
assert len(sequence_pair) == 2
# convert the generic sequences to strings
sa = ''.join(list(sequence_pair[0])).upper()
sb = ''.join(list(sequence_pair[1])).upper()
st = sa+sb
# assert that the sequences are long enough and are the same length
assert len(sa) > 1
assert len(sa) == len(sb)
# assert that the strings are composed of valid nucleotides
assert set(st) <= set('ACGT')
# get the estimated nucleotide distribution
n = len(sa)
assert len(st) == 2*n
A = st.count('A') / float(2*n)
C = st.count('C') / float(2*n)
G = st.count('G') / float(2*n)
T = st.count('T') / float(2*n)
# get some intermediate variables
R = A + G
Y = C + T
if not R or not Y:
assert False, 'expected some purines and some pyrimidines'
# get the fraction of sites with transitional and with transversional differences
P = 0
Q = 0
for a, b in zip(sa, sb):
key = (a, b)
if a == b:
continue
elif key in g_transitions:
P += 1
else:
Q += 1
P /= float(n)
Q /= float(n)
# get some more intermediate variables
A_num = 2*(T*C + A*G) + 2*(T*C*R/Y + A*G*Y/R) * (1 - Q/(2*Y*R)) - P
A_den = 2*(T*C/Y + A*G/R)
A_total = A_num / A_den
B_total = 1 - Q/(2*Y*R)
if A_total <= 0 or B_total <= 0:
assert False, 'range error in an intermediate calculation'
alpha = -math.log(A_total)/2
beta = -math.log(B_total)/2
# estimate kappa
kappa = alpha / beta - 1
# estimate the distance
distance = beta * (4*T*C*(1+kappa/Y) + 4*A*G*(1+kappa/R) + 4*Y*R)
# return the estimates
return (distance, kappa, A, C, G, T)
def parameters_to_distribution(parameters):
"""
@param parameters: these N-1 degrees of freedom define a distribution over N states
"""
den = 1.0 + sum(math.exp(p) for p in parameters)
dist = [1.0 / den]
for p in parameters:
dist.append(math.exp(p) / den)
return dist
class Objective:
"""
This class defines a function object used for testing maximum likelihood estimation.
This is currently used only for testing.
An instance will be called using scipy.optimize.fmin(...).
"""
def __init__(self, sequence_pair):
"""
@param sequence_pair: the observed data
"""
self.sequence_pair = sequence_pair
def get_initial_parameters(self):
"""
@return: a vector of parameters
"""
# get an initial guess
distance, kappa, A, C, G, T = get_closed_form_estimates(self.sequence_pair)
wC, wG, wT = (0, 0, 0)
# perturb the initial guess just to show that it is arbitrary
perturbation = .01
theta = distance + perturbation, kappa + perturbation, wC, wG, wT
# return the initial guess
return theta
def __call__(self, theta):
"""
@param theta: the vector of estimated parameters
@return: the negative log likelihood to be minimized
"""
# unpack the parameters
distance, kappa, wC, wG, wT = theta
nt_distribution = parameters_to_distribution((wC, wG, wT))
# make the rate matrix
model = create_rate_matrix(kappa, nt_distribution)
# get the likelihood
log_likelihood = PairLikelihood.get_log_likelihood(distance, self.sequence_pair, model)
return -log_likelihood
class TestF84(unittest.TestCase):
def testMLE(self):
"""
Assert that there is no syntax error in the MLE estimation code.
"""
sa = 'AAACG'
sb = 'TTACG'
pair = (sa, sb)
distance, kappa, A, C, G, T = get_closed_form_estimates(pair)
def testCreate(self):
"""
Assert that there is no syntax error in the matrix creation code.
"""
nt_distribution = (.25, .25, .25, .25)
kappa = 1.0
rate_matrix_object = create_rate_matrix(kappa, nt_distribution)
def testNormalization(self):
"""
Assert that the matrix is properly normalized.
"""
nt_distribution = (.1, .2, .3, .4)
kappa = 2.0
rate_matrix_object = create_rate_matrix(kappa, nt_distribution)
self.assertAlmostEqual(1.0, rate_matrix_object.get_expected_rate())
def testLikelihood(self):
"""
Assert that no errors occur during the analysis
"""
# define a simple (but not completely degenerate) alignment
sa = 'AAAACCCCGGGGTTAA'
sb = 'GAAACCTCGGCGTAAA'
sequence_pair = (sa, sb)
# get estimates according to an analytical formula which is not necessarily the mle
distance_mle, kappa_mle, A_mle, C_mle, G_mle, T_mle = get_closed_form_estimates((sa, sb))
nt_distribution_mle = (A_mle, C_mle, G_mle, T_mle)
rate_matrix_object = create_rate_matrix(kappa_mle, nt_distribution_mle)
log_likelihood_mle = PairLikelihood.get_log_likelihood(distance_mle, sequence_pair, rate_matrix_object)
# get the maximum likelihood estimates according to a numeric optimizer.
f = Objective((sa, sb))
values = list(f.get_initial_parameters())
result = scipy.optimize.fmin(f, values, ftol=.0000000001, disp=0)
distance_opt, kappa_opt, wC_opt, wG_opt, wT_opt = result
nt_distribution_opt = parameters_to_distribution((wC_opt, wG_opt, wT_opt))
rate_matrix_object = create_rate_matrix(kappa_opt, nt_distribution_opt)
log_likelihood_opt = PairLikelihood.get_log_likelihood(distance_opt, sequence_pair, rate_matrix_object)
if __name__ == '__main__':
from optparse import OptionParser
parser = OptionParser()
parser.add_option('--test', action='store_true', dest='test', default=False)
options, args = parser.parse_args()
if options.test:
suite = unittest.TestLoader().loadTestsFromTestCase(TestF84)
unittest.TextTestRunner(verbosity=2).run(suite)