-
Notifications
You must be signed in to change notification settings - Fork 0
/
neandertal.py
184 lines (151 loc) · 5.12 KB
/
neandertal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import os
import sys
import subprocess
import genotype_tools
import mysql.connector
# Keyword arguments expanded into mysql.connector.Connect() when the script
# opens its database connection.
DB_INFO = dict(
    host="marlowe",
    user="gene210-user",
    passwd="genomics",
    buffered=True,
)

# Valid population identifiers.
POPULATIONS = ["CEU", "YRI", "JPT", "CHB"]
# The same identifiers as one comma-separated string, for prompts and error
# messages.
POPULATION_CHOICES = ", ".join(POPULATIONS)

# Relative path to a neandertal SNP CSV file.
# NOTE(review): nothing in this script reads it -- presumably the source used
# to populate the database table; confirm before removing.
SNPS_PATH = "data/neandertal-SNPs.csv"

# Complement of each nucleotide; used to flip a genotype onto the opposite
# strand when it does not match the database alleles as given.
BASE_MAP = dict(A="T", T="A", C="G", G="C")
##
# Fetch information from the database for the user's value for a neandertal
# SNP.
#
# Parameters:
#   nsnp    -- object exposing `rsid` and `genotype`; the genotype is compared
#              as a string of allele letters (e.g. "AG").
#   flipped -- True when `nsnp.genotype` has already been complemented; it
#              stops the function from flipping a second time. Callers
#              normally leave it at the default.
#
# Returns a dict holding every column of the matching row of
# diseases.neandertal_snps (keyed by column name) plus:
#   "user_alleles" -- the genotype that was compared (the complemented one if
#                     a strand flip was needed);
#   "score"        -- (numerator, denominator) increment for the neandertal
#                     score: (2, 2) homozygous neandertal, (1, 2)
#                     heterozygous, (0, 2) homozygous ancestral, and (0, 0)
#                     when the alleles could not be matched on either strand.
#
# Uses the module-level cursor `db`. Side effect: when the genotype does not
# match as given, `nsnp.genotype` is replaced by its complement.
def get_snp_info(nsnp, flipped=False):
    # Score is the amount by which we increment the numerator and denominator
    # of the neandertal score.
    snp_info = {"score": (0, 0)}

    query = "SELECT * FROM diseases.neandertal_snps WHERE rsid = %s;"
    db.execute(query, (nsnp.rsid,))
    # NOTE(review): fetchone() yields None for an rsid missing from the table;
    # the caller in this script takes its rsids from that same table -- confirm
    # before calling this from anywhere else.
    res = db.fetchone()

    # db.description is a list of tuples; the first element of each tuple is
    # the field name.
    for i, attribute_info in enumerate(db.description):
        snp_info[attribute_info[0]] = res[i]
    snp_info["user_alleles"] = nsnp.genotype

    heterozygote = snp_info["out_of_africa"] + snp_info["ancestral"]

    # Homozygous homo neanderthalensis.
    if nsnp.genotype == 2 * snp_info["out_of_africa"]:
        snp_info["score"] = (2, 2)
    # Homozygous homo sapiens.
    elif nsnp.genotype == 2 * snp_info["ancestral"]:
        snp_info["score"] = (0, 2)
    # Otherwise, we're a heterozygote (allele order does not matter).
    elif sorted(nsnp.genotype) == sorted(heterozygote):
        snp_info["score"] = (1, 2)
    elif not flipped:
        # No match as given: retry on the opposite strand. Characters without
        # a complement are kept unchanged so that an unexpected symbol ends up
        # in the "could not match" branch below instead of raising KeyError.
        nsnp.genotype = "".join(BASE_MAP.get(a, a) for a in nsnp.genotype)
        # The flipped lookup IS the result; it must be returned, otherwise a
        # successful flipped match would still be reported as (0, 0).
        return get_snp_info(nsnp, True)
    else:
        # Hmmm... make no adjustment to the running numerator or denominator
        # for this SNP, we couldn't match alleles. This ought never happen.
        print("Error occurred matching user SNP to neandertal SNP %s!"
              % (nsnp.rsid,))

    return snp_info
##
# Point of entry is here!
#
# Command line: python neandertal.py <genome file> [population identifier].
# Exits with status 2 on a bad argument count, an unknown population or a
# missing genome file. On success this section leaves `population`,
# `user_genome_path` and `user_snps` defined for the rest of the script.
if len(sys.argv) not in (2, 3):
    print(
        "Usage: python neandertal.py path/to/user/genome/file.txt "
        "[population identifier] (if population not given, you will be "
        "prompted for its value)."
    )
    sys.exit(2)

if len(sys.argv) == 3:
    population = sys.argv[2]
else:
    population = raw_input("Select a population (%s): " % POPULATION_CHOICES)
# Normalise both sources the same way, so "ceu" is accepted whether it was
# typed at the prompt or passed on the command line.
population = population.strip().upper()

if population not in POPULATIONS:
    print(
        "Population identifier '%s' is not valid. "
        "Valid population identifiers are one of (%s)."
        % (population, POPULATION_CHOICES)
    )
    sys.exit(2)

user_genome_path = sys.argv[1]
if not os.path.exists(user_genome_path):
    print("No file found at '%s'." % (user_genome_path,))
    sys.exit(2)

# Written without a newline and flushed so the message is visible while the
# genotype file is being read.
sys.stdout.write("Loading genotype file '%s'... " % (user_genome_path,))
sys.stdout.flush()
user_snps = genotype_tools.FileUtils.read_genotype_file(user_genome_path)
print("done!")
##
# Calculation begins at this point.
#
# For every rsid in diseases.neandertal_snps, look the SNP up in the user's
# genotypes (imputing it when absent), score it with get_snp_info() and store
# the result in `results`, keyed by the rsid of the SNP that was scored.
# `nnumerator` / `ndenominator` are the summed score components.
results = {}

database = mysql.connector.Connect(**DB_INFO)
db = database.cursor()
try:
    db.execute("SELECT rsid FROM diseases.neandertal_snps;")

    # All rsids are fetched before the loop starts because get_snp_info()
    # issues its own queries on this same cursor.
    for nrsid in [row[0] for row in db.fetchall()]:
        user_nsnp = user_snps.get(nrsid, None)

        # We could not get this SNP directly: impute.
        if user_nsnp is None:
            user_nsnp = genotype_tools.impute_rsid_simple(
                user_snps, nrsid, population
            )
            print("Imputed %s -> %s." % (user_nsnp.nearest_SNP, user_nsnp.rsid,))

        snp_info = get_snp_info(user_nsnp)

        # If we imputed, user_nsnp.rsid is the imputed rsid, and nearest_SNP is
        # the rsid of the SNP we were imputing from (imputing for).
        if user_nsnp.nearest_SNP is not None:
            snp_info["imputed_from"] = user_nsnp.nearest_SNP

        results[user_nsnp.rsid] = snp_info
finally:
    # The database is not needed past this point; release the cursor and the
    # connection even if a lookup or an imputation raised.
    db.close()
    database.close()

nnumerator = sum(a["score"][0] for a in results.values())
ndenominator = sum(a["score"][1] for a in results.values())
# Name the output HTML file based on the input genome file's name.
filename = os.path.splitext(os.path.basename(user_genome_path))[0]
out_file_name = "%s.html" % (filename,)

# With no scorable SNP the denominator is 0; report 0% rather than dying with
# a ZeroDivisionError after all the work has been done.
if ndenominator:
    shared_percent = 100 * nnumerator / float(ndenominator)
else:
    shared_percent = 0.0

# The `with` block guarantees the report is flushed and closed even if
# formatting one of the rows raises.
with open(out_file_name, "w") as out_file:
    out_file.write("""
<html>
<head>
<title>
Are You a Caveman?
</title>
</head>
<body>
<h3 style="text-align:left;">
Caveman Index:
</h3>
Genome file: %s<br>
Population given was: %s<br>
Alleles shared with a caveman: %d / %d (%0.4f%%)<br><br>
""" % (filename, population, nnumerator, ndenominator, shared_percent))

    out_file.write("""
<table border="1">
<tr>
<th>rsID</th>
<th>Caveman Allele</th>
<th>Ancestral Allele</th>
<th>User Alleles</th>
<th>Imputed From</th>
</tr>
""")

    ##
    # We just reserve the right to sort later, by changing the key function to
    # something meaningful (this code was extensively copied and pasted).
    # For now rows are ordered by rsid, descending.
    for (rsid, snp_info) in sorted(
        results.items(), key=lambda item: item[0], reverse=True
    ):
        # Every row carries all five cells (the "Imputed From" cell is empty
        # for SNPs read directly from the genome file) and is closed with
        # </tr>, so the rows line up with the header.
        out_file.write("""
<tr>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
</tr>
""" % (rsid, snp_info["out_of_africa"], snp_info["ancestral"],
       "/".join(snp_info["user_alleles"]),
       snp_info.get("imputed_from", "")))

    out_file.write("</table></body></html>")

# Hand the report to the `open` command (presumably the macOS launcher). If
# that command is not available, the report is still on disk: say where it is
# instead of ending with a traceback.
try:
    subprocess.Popen(("open", out_file_name)).wait()
except OSError:
    print("Report written to '%s'." % (out_file_name,))