# longevity.py
import cgi
import sys
import os
import genotype_tools
import subprocess
import urllib
import snp
import mysql
# Complementarity map: each base paired with the base on the opposite strand.
BASE_MAP = {
    "A": "T",
    "T": "A",
    "C": "G",
    "G": "C",
}


def _genotype_class(genotype, el_alleles):
    """Return "aa", "ab" or "bb" for *genotype*, or None when nothing matches.

    *el_alleles* is the two-element [A, B] allele list of the EL SNP. The
    checks run in the order homozygous A, heterozygous, homozygous B.
    """
    if genotype == 2 * el_alleles[0]:
        return "aa"
    if sorted(genotype) == sorted(el_alleles):
        return "ab"
    if genotype == 2 * el_alleles[1]:
        return "bb"
    return None


def get_snp_info(el_snp, flipped=False, cursor=None):
    """Look up the EL record for *el_snp* and match the user's genotype to it.

    Parameters:
        el_snp: object with an ``rsid`` string (with its two-character prefix,
            which is stripped for the lookup) and a ``genotype`` string. When
            the genotype only matches on the opposite strand, ``genotype`` is
            replaced with its complement.
        flipped: when true, the genotype is taken as already complemented and
            no strand flip is attempted.
        cursor: DB-API cursor to query; defaults to the module-level ``db``.

    Returns:
        dict with keys
        ``el_alleles``  - the two alleles from column 2 of the row,
        ``aa``/``ab``/``bb`` - [cases, controls] frequency pairs,
        ``bayes_factor`` - column 9 of the row,
        ``user_alleles`` - the genotype that was matched (or the original
            genotype when nothing matched),
        ``probabilities`` - the pair for the user's genotype, or ``(1, 1)``
            (a neutral odds ratio) when the genotype could not be matched,
        ``centenarian_allele`` - the first allele if its weighted count
            ``2 * aa + ab`` among cases is at least that among controls,
            otherwise the second allele.

    Raises:
        LookupError: if the table has no row for this SNP.
    """
    if cursor is None:
        cursor = db

    # Index of each frequency column in rows returned from the database, e.g.
    # f_cases_aa is at 12 and f_controls_aa is at 15.
    frequency_columns = {
        "aa": (12, 15),
        "ab": (13, 16),
        "bb": (14, 17),
    }
    bayes_factor_column = 9

    # Query params are %s with this library, not ?.
    query = "SELECT * FROM diseases.el_snps WHERE db_snp = %s;"
    cursor.execute(query, (el_snp.rsid[2:],))
    row = cursor.fetchone()
    if row is None:
        raise LookupError("No EL SNP record found for %s." % (el_snp.rsid,))

    el_alleles = row[2].split("/")
    snp_info = {
        "el_alleles": el_alleles,
        "bayes_factor": row[bayes_factor_column],
    }
    for key, (cases_column, controls_column) in frequency_columns.items():
        snp_info[key] = [row[cases_column], row[controls_column]]

    matched = _genotype_class(el_snp.genotype, el_alleles)
    if matched is None and not flipped:
        # The user's file may report the opposite strand: retry with the
        # complemented genotype. Symbols outside BASE_MAP (e.g. no-calls) are
        # kept as-is so they fall through to the "no match" path instead of
        # raising KeyError.
        complemented = "".join(BASE_MAP.get(base, base) for base in el_snp.genotype)
        matched = _genotype_class(complemented, el_alleles)
        if matched is not None:
            # Only keep the flip when it actually matched, so an unmatched
            # genotype is still reported exactly as the user supplied it.
            el_snp.genotype = complemented

    snp_info["user_alleles"] = el_snp.genotype
    if matched is None:
        print("Error occurred matching user SNP to EL SNP %s!" % (el_snp.rsid,))
        snp_info["probabilities"] = (1, 1)
    else:
        snp_info["probabilities"] = snp_info[matched]

    el_top_frequency = 2 * snp_info["aa"][0] + snp_info["ab"][0]
    al_top_frequency = 2 * snp_info["aa"][1] + snp_info["ab"][1]
    if el_top_frequency >= al_top_frequency:
        snp_info["centenarian_allele"] = el_alleles[0]
    else:
        snp_info["centenarian_allele"] = el_alleles[1]
    return snp_info
# Endpoint and fixed query parameters for the longevity line chart. The keys
# are Google Image Charts parameter names; see that API's reference for the
# meaning of each one.
# NOTE(review): "chxr" hard-codes an x range of 1-150, presumably the number
# of EL SNPs plotted - confirm if the SNP table changes size.
GOOGLE_CHARTS_BASE_URL = "http://chart.apis.google.com/chart"
GOOGLE_CHART_PARAMS = {
    "cht": "lc",
    "chs": "630x240",
    "chtt": "Probability of Exceptional Longevity",
    "chma": "15,5,5,5",
    "chxt": "x,x,y,y",
    "chxr": "0,1,150|2,0,1",
    "chxl": "1:|Number of SNPs|3:|Probability of EL|",
    "chxp": "1,50|3,50",
}


def build_chart_url(el_running_odds):
    """Return the chart URL plotting the values in *el_running_odds*.

    Each value is rounded to four decimal places and sent as the "chd" data
    series ("t:v1,v2,..."). The fixed parameters come from
    GOOGLE_CHART_PARAMS, which is left untouched: the data series is added to
    a per-call copy so repeated calls cannot leak state into one another.
    Parameter values are percent-encoded (keeping the ",", ":" and "|"
    delimiters readable) so the result contains no raw spaces.
    """
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3

    odds_series = ",".join(str(round(value, 4)) for value in el_running_odds)
    params = dict(GOOGLE_CHART_PARAMS)
    params["chd"] = "t:%s" % (odds_series,)
    query_string = "&".join(
        "%s=%s" % (name, quote(value, safe=",:|"))
        for name, value in params.items()
    )
    return GOOGLE_CHARTS_BASE_URL + "?" + query_string
# `import mysql` on its own does not load the `connector` submodule, so import
# it explicitly instead of relying on some other module having done so.
import mysql.connector

# Open the connection and the single shared cursor (`db`) used for every
# query in this script. buffered=True is passed through to the connector as
# in the original call.
# NOTE(review): host, user and password are hard-coded here; consider moving
# them to configuration or environment variables.
database = mysql.connector.connect(host="marlowe", user="gene210-user",
                                   passwd="genomics", buffered=True)
db = database.cursor()
# ---------------------------------------------------------------------------
# Command line: <genome file> [population identifier] [codename]
# ---------------------------------------------------------------------------
population = None
populations = ("CEU", "YRI")
population_choices = "(" + ", ".join(populations) + ")"
codename = None

if len(sys.argv) < 2:
    # Report the name this script was actually invoked with.
    print(
        "Usage: python %s path/to/user/genome/file.txt "
        "[population identifier] [codename]"
        % (os.path.basename(sys.argv[0]),)
    )
    print(
        "If the population identifier or codename are not given, you will be "
        "prompted for their values."
    )
    sys.exit(2)

# Fail fast on a bad path, before asking the user any questions.
user_genome_path = sys.argv[1].strip()
if not os.path.exists(user_genome_path):
    print("No file found at %s." % (user_genome_path,))
    sys.exit(2)

# The optional arguments are positional, so a population may be supplied
# without a codename; anything not supplied is prompted for below.
if len(sys.argv) >= 3:
    population = sys.argv[2].strip().upper()
if len(sys.argv) >= 4:
    codename = sys.argv[3].strip()

if population is None:
    population = raw_input("Select a population %s: " % population_choices)
    population = population.strip().upper()
# Validate the population whether it came from the command line or the prompt.
if population not in populations:
    print("Population identifier %s is not valid." % (population,))
    sys.exit(1)

if codename is None:
    codename = raw_input("Select a codename: ").strip()

# Progress message without a trailing newline; "done!" completes the line.
sys.stdout.write("Loading genotype file %s... " % (user_genome_path,))
sys.stdout.flush()
user_snps = genotype_tools.FileUtils.read_genotype_file(user_genome_path)
print("done!")
db.execute("SELECT db_snp FROM diseases.el_snps")
el_rsids = [row[0] for row in db.fetchall()]

# Now, we want to compute
#   a = prior(EL) * product(i = 1 to 150, Pr(SNP-i | EL))
#   b = prior(AL) * product(i = 1 to 150, Pr(SNP-i | AL))
#   a / (a + b)
# To avoid underflow, we don't do precisely this - instead, we start with an
# odds ratio of 1 (= 0.5 / 0.5, from the paper's priors on Pr(EL)
# and Pr(AL)), and then adjust that each time by
# Pr(SNP-i | EL) / Pr(SNP-i | AL).
# Kept as a float so that odds / (1 + odds) is never an integer division.
el_odds = 1.0

# Store the intermediate results for plotting, starting with 50%.
el_running_odds = [50]

# Track all SNPs for output, keyed by the user's rsID.
results = {}

# For each EL-SNP, we get the value of that SNP from the provided genome
# and adjust the running odds according to the values from the paper.
for el_rsid in el_rsids:
    full_rsid = "rs%s" % (el_rsid,)

    # If we already have a value for this SNP - use it.
    user_el_snp = user_snps.get(full_rsid, None)

    # Impute if we have to...
    if user_el_snp is None:
        try:
            user_el_snp = genotype_tools.impute_rsid_simple(
                user_snps, full_rsid, population
            )
        # TODO: workaround error with rs2042831 and CEU from Mikolaj Habryn.
        except ValueError as e:
            print("Error occurred imputing for rs%s: %s." % (el_rsid, e))
            continue
        # Imputation returned None (this should never happen). Check this
        # before touching any attribute of the result.
        if user_el_snp is None:
            print(
                "Unable to impute value for rs%s: imputation returned None."
                % (el_rsid,)
            )
            continue
        # This SNP was imputed.
        print("Imputed %s." % (user_el_snp.nearest_SNP,))

    snp_info = get_snp_info(user_el_snp)
    if user_el_snp.nearest_SNP is not None:
        snp_info["imputed_from"] = user_el_snp.nearest_SNP

    p_el = snp_info["probabilities"][0]
    p_al = snp_info["probabilities"][1]
    if not p_al:
        # A zero (or missing) control frequency has no finite odds ratio;
        # skip the SNP rather than abort the whole run.
        print(
            "Skipping %s: no usable control frequency for genotype %s."
            % (user_el_snp.rsid, snp_info["user_alleles"])
        )
        continue

    # Update the odds - multiply by the ratio of the probability of this
    # genotype given EL to its probability given AL. Both are converted to
    # float so the running product has one numeric type throughout.
    el_odds *= float(p_el) / float(p_al)

    # Store the running total for display later.
    el_running_odds.append(100 * (el_odds / (1 + el_odds)))
    results[user_el_snp.rsid] = snp_info

el_probability = 100 * (el_odds / (1 + el_odds))
print("Percentage chance of living to 100: %f" % (el_probability,))
# Record this run. The values are passed as query parameters (this library
# uses %s placeholders) rather than formatted into the SQL text, so a
# codename containing quotes cannot break or alter the statement.
db.execute(
    "INSERT INTO diseases.class_el_probabilities "
    "(codename, population, probability) "
    "VALUES (%s, %s, %s);",
    (codename, population, el_probability),
)
# Commit explicitly so the row is not lost if autocommit is disabled on this
# connection.
database.commit()

# No further queries are issued after this point.
db.close()
database.close()
# Name the output HTML file based on the input genome file's name.
filename = os.path.splitext(os.path.split(user_genome_path)[1])[0]
out_file_name = "%s.html" % (filename,)

try:
    from html import escape as _escape_markup  # Python 3
except ImportError:
    from cgi import escape as _escape_markup  # Python 2


def _html(value):
    """Return *value* as text safe to place in HTML content or attributes."""
    return _escape_markup(str(value), True)


chart_url = build_chart_url(el_running_odds)

# `with` guarantees the report is flushed and closed even if a row fails to
# format.
with open(out_file_name, "w") as out_file:
    out_file.write("""
<html>
<head>
<title>
Exceptional Longevity Exercise
</title>
</head>
<body>
<h3 style="text-align:left;">
Estimated Probability of Exceptional Longevity
</h3>
Genome file: %s<br>
Population given was: %s<br>
Estimated probability of living &gt; 100 years: %.4f%%<br><br>
""" % (_html(filename), _html(population), el_probability))

    out_file.write('<img src="%s"><br><br>' % (_html(chart_url),))

    out_file.write("""
<table>
<tr>
<th>rsID</th>
<th>Centenarian Alleles</th>
<th>Favorable Allele</th>
<th>Bayes Factor</th>
<th>User Alleles</th>
<th>Imputed From</th>
</tr>
""")

    # One row per SNP. The keys of `results` already carry the "rs" prefix,
    # so it is not added again. Every row gets all six cells (the last one
    # empty when the SNP was not imputed) and is closed with </tr>.
    for rsid, row_info in results.items():
        out_file.write("""
<tr>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%1.2f</td>
<td>%s</td>
<td>%s</td>
</tr>
""" % (
            _html(rsid),
            _html("".join(row_info["el_alleles"])),
            _html(row_info["centenarian_allele"]),
            row_info["bayes_factor"],
            _html(row_info["user_alleles"]),
            _html(row_info.get("imputed_from", "")),
        ))

    out_file.write("</table></body></html>")

# Show the finished report.
# NOTE(review): `open` is presumably the macOS launcher command; on other
# platforms this call will fail - confirm the intended target platform.
subprocess.Popen(("open", out_file_name)).wait()