forked from argriffing/xgcode
-
Notifications
You must be signed in to change notification settings - Fork 0
/
20120621a.py
209 lines (196 loc) · 7.47 KB
/
20120621a.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
"""
Sample a child chromosome given a parent population. [UNFINISHED]
The model is similar to that of Wright-Fisher
and has mutation, selection, and recombination.
Each of these effects is associated with a separate global parameter.
For the parental population field,
each row corresponds to a single chromosome
and each column corresponds to a position within the chromosome.
At every position in every chromosome
is either a low fitness or a high fitness allele,
denoted by 0 or 1 respectively.
"""
from StringIO import StringIO
from itertools import combinations, product
import numpy as np
import gmpy
import Form
import FormOut
import MatrixUtil
from smallutil import stripped_lines
import Util
import popgenmarkov
# Default parental population shown in the form: one chromosome per line,
# one allele (0 = low fitness, 1 = high fitness) per character.
g_default_parental_state = '\n'.join(('1111', '0000'))
def get_form():
    """
    Define the input fields of this script.
    @return: a list of Form objects, in display order
    """
    selection_field = Form.Float(
            'selection_param', 'multiplicative allele fitness',
            '2.0', low_exclusive=0)
    mutation_field = Form.Float(
            'mutation_param', 'mutation randomization probability',
            '0.0001', low_inclusive=0, high_inclusive=1)
    recombination_field = Form.Float(
            'recombination_param',
            'linkage phase randomization probability',
            '0.001', low_inclusive=0, high_inclusive=1)
    parental_field = Form.MultiLine(
            'parental_state', 'parental population',
            g_default_parental_state)
    return [
            selection_field,
            mutation_field,
            recombination_field,
            parental_field,
            ]
def get_form_out():
    """
    @return: a newly constructed FormOut.Report object
    """
    report = FormOut.Report()
    return report
def multiline_state_to_ndarray(multiline_state):
    """
    Parse a multi-line allele string into a 2d integer array.
    Every line yielded by smallutil.stripped_lines becomes one row
    and every character of that line becomes one entry of the row.
    (stripped_lines presumably strips whitespace and drops blank lines;
    confirm against smallutil.)
    @param multiline_state: a string with one chromosome per line
    @return: a numpy array with one row per line
    @raise ValueError: if a character is not an integer digit,
    if an allele is not 0 or 1, or if the rows have unequal lengths
    """
    rows = []
    for line in stripped_lines(multiline_state.splitlines()):
        row = []
        for symbol in line:
            # int() itself raises ValueError for non-numeric characters
            allele = int(symbol)
            if allele not in (0, 1):
                raise ValueError('invalid allele')
            row.append(allele)
        # Reject ragged input explicitly; otherwise numpy would either build
        # a 1d object array or fail with an error unrelated to the input.
        if rows and len(row) != len(rows[0]):
            raise ValueError('all chromosomes must have the same length')
        rows.append(row)
    return np.array(rows)
def ndarray_to_multiline_state(K):
    """
    Format a 2d collection of alleles as text, one row per line.
    @param K: an iterable of rows whose entries are written using str()
    @return: newline-separated rows, without a trailing newline
    """
    lines = []
    for row in K:
        lines.append(''.join(map(str, row)))
    return '\n'.join(lines)
def get_transition_matrix(
        selection, mutation, recombination,
        nchromosomes, npositions):
    """
    Build the population state transition matrix.
    A population state is an integer with one bit per allele,
    so there are 2^(nchromosomes*npositions) states.
    The returned matrix is the product of two transition matrices:
    the first accounts for selection and recombination
    and the second independently accounts for mutation.
    @param selection: a fitness ratio
    @param mutation: a state change probability
    @param recombination: a linkage phase change probability
    @param nchromosomes: number of chromosomes in the population
    @param npositions: number of positions per chromosome
    @return: a numpy array
    """
    nbits = nchromosomes * npositions
    nstates = 1 << nbits
    # Mutation factor: every allele independently flips
    # with probability equal to the mutation parameter,
    # so an entry depends only on the hamming distance between the states.
    # Each row of this matrix sums to one.
    P_mutation = np.zeros((nstates, nstates))
    for source in range(nstates):
        for sink in range(nstates):
            ndiff = gmpy.hamdist(source, sink)
            nsame = nbits - ndiff
            p_changed = mutation**ndiff
            p_unchanged = (1-mutation)**nsame
            P_mutation[source, sink] = p_changed*p_unchanged
    # Selection and recombination factor, unnormalized:
    # each child chromosome is drawn from the same per-source distribution,
    # so the weight of a child population is a product over its chromosomes.
    chromosome_states = range(1 << npositions)
    M = np.zeros((nstates, nstates))
    for source_index in range(nstates):
        K_source = popgenmarkov.int_to_bin2d(
                source_index, nchromosomes, npositions)
        chromosome_distn = popgenmarkov.get_chromosome_distn(
                selection, recombination, K_source)
        for child in product(chromosome_states, repeat=nchromosomes):
            weight = 1
            sink_index = 0
            for chromosome in child:
                weight *= chromosome_distn[chromosome]
                # pack the chromosomes into one integer, first one highest
                sink_index = (sink_index << npositions) | chromosome
            M[source_index, sink_index] = weight
    # normalize so that each row sums to one
    row_sums = np.sum(M, axis=1)
    P_selection_recombination = M / row_sums[:, np.newaxis]
    # define the state transition probability matrix
    return np.dot(P_selection_recombination, P_mutation)
def sample_endpoint_conditioned_path(
        initial_state, final_state, path_length, P):
    """
    Return a sequence of states.
    The returned sequence starts at the initial state
    and ends at the final state.
    Consecutive states may be the same
    if the transition matrix has positive diagonal elements.
    Each interior state is drawn with weight proportional to
    the probability of stepping to it from the previous state
    times the probability of reaching the final state from it
    in the number of steps that remain.
    A path of length two is returned as-is, without checking
    that the direct transition has positive probability.
    This function is derived from an earlier function
    in an old SamplePath module.
    @param initial_state: the first state as a python integer
    @param final_state: the last state as a python integer
    @param path_length: the number of states in the returned path
    @param P: ndarray state transition matrix
    @return: list of integer states
    @raise ValueError: if a state is out of range, if the path length
    is negative, or if a path of length one has unequal endpoints
    """
    # get the size of the state space and do some input validation
    MatrixUtil.assert_transition_matrix(P)
    nstates = len(P)
    if not (0 <= initial_state < nstates):
        raise ValueError('invalid initial state')
    if not (0 <= final_state < nstates):
        raise ValueError('invalid final state')
    # A negative length would otherwise skip every edge case and both loops
    # and silently return a two-state path.
    if path_length < 0:
        raise ValueError('the path length must be nonnegative')
    # take care of edge cases
    if path_length == 0:
        return []
    if path_length == 1:
        if initial_state != final_state:
            raise ValueError('unequal states for a path of length one')
        return [initial_state]
    if path_length == 2:
        return [initial_state, final_state]
    # matrix_powers[k] is the k-th power of P for 1 <= k <= max_power;
    # index zero is only a placeholder because it is never read.
    max_power = path_length - 2
    matrix_powers = [None, P]
    for power in range(2, max_power + 1):
        matrix_powers.append(np.dot(matrix_powers[power-1], P))
    # sample the interior states from left to right
    path = [initial_state]
    for i in range(1, path_length-1):
        previous_state = path[-1]
        # transition probabilities over the steps that remain after this one
        P_remaining = matrix_powers[path_length - 1 - i]
        weight_state_pairs = []
        for state_index in range(nstates):
            weight = P[previous_state, state_index]
            weight *= P_remaining[state_index, final_state]
            weight_state_pairs.append((weight, state_index))
        # presumably returns the state of one pair, chosen by weight
        next_state = Util.weighted_choice(weight_state_pairs)
        path.append(next_state)
    path.append(final_state)
    return path
def get_response_content(fs):
    """
    Build the text report of this script. [UNFINISHED]
    The attributes of fs that are read here are parental_state,
    mutation_param, recombination_param, selection_param and ngenerations.
    @param fs: an object whose attributes hold the submitted form values
    @return: the report as a string
    @raise ValueError: if the population has more than 10 alleles in total
    """
    # NOTE(review): as written this function cannot run to completion.
    # The names initial_state and final_state are never assigned,
    # and get_form does not declare an ngenerations field.
    # The path sampling part looks like it was carried over
    # from a different script -- confirm the intended inputs before fixing.
    parental_state = multiline_state_to_ndarray(fs.parental_state)
    nchromosomes, npositions = parental_state.shape
    # there is one state per assignment of alleles, so 2^(total alleles)
    if nchromosomes * npositions > 10:
        raise ValueError('at most 2^10 states are allowed')
    # The form asks for randomization probabilities whereas the transition
    # matrix takes change probabilities; presumably the factor of one half
    # is because a randomized binary value changes half of the time.
    mutation = 0.5 * fs.mutation_param
    recombination = 0.5 * fs.recombination_param
    selection = fs.selection_param
    # begin the report with the dimensions of the population
    out = StringIO()
    print >> out, 'number of chromosomes:'
    print >> out, nchromosomes
    print >> out
    print >> out, 'number of positions per chromosome:'
    print >> out, npositions
    print >> out
    # define the transition matrix
    P = get_transition_matrix(
            selection, mutation, recombination, nchromosomes, npositions)
    # sample the endpoint conditioned path
    # NOTE(review): initial_state and final_state are undefined here.
    initial_integer_state = popgenmarkov.bin2d_to_int(initial_state)
    final_integer_state = popgenmarkov.bin2d_to_int(final_state)
    path = sample_endpoint_conditioned_path(
            initial_integer_state, final_integer_state,
            fs.ngenerations, P)
    print >> out, 'sampled endpoint conditioned path, including endpoints:'
    print >> out
    # write each sampled population as rows of alleles, then a blank line
    for integer_state in path:
        # print integer_state
        print >> out, ndarray_to_multiline_state(
                popgenmarkov.int_to_bin2d(
                    integer_state, nchromosomes, npositions))
        print >> out
    return out.getvalue()