/
test_host_virus_hypothesis.py
58 lines (50 loc) · 2.16 KB
/
test_host_virus_hypothesis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
""" For the input ELMs, take the JS
divergence of the human (H3N2) and chicken (H5N1)
flu to the human & chicken hosts, and print the ELMs
for which the hypothesis holds. Sample equally
from flus to avoid biases. """
import sys, utils, os, utils_graph
from collections import defaultdict
# Script body: for every ELM listed in the input file, compare each flu's
# ELM-sequence distribution against the human and chicken host
# distributions with Jensen-Shannon divergence, and print the ELMs for
# which each flu scores lower against its own host than against the
# other host.

# Fail with a usage message rather than a bare IndexError when the ELM
# file argument is missing.
if len(sys.argv) < 2:
    sys.exit('usage: test_host_virus_hypothesis.py <elm_file>')
elm_file = sys.argv[1]
working_elms = utils_graph.getNodes(elm_file)

flu_counts = {}
seen_seqs = {}
# elm -> {'elm:seq': True} for every sampled flu sequence of that ELM,
# pooled over both flus.
elm2seqs = defaultdict(dict)
flus = ('human', 'chicken')

for flu in flus:
    # Human flu is read from the H3N2 2008 file, chicken flu from the
    # H5N1 2006 file; both live in the same working directory.
    if 'human' in flu:
        flu_suffix = '.H3N2.2008.elms'
    else:
        flu_suffix = '.H5N1.2006.elms'
    flu_elm_file = os.path.join('working/Jul1_year', flu + flu_suffix)

    # NOTE(review): count_flu_sampled is presumably what fills
    # flu_counts and seen_seqs[flu] (both are read right after this
    # call) -- confirm against utils.
    utils.count_flu_sampled(flu, flu_elm_file, flu_counts,
                            seen_seqs, {}, False)
    for elmseq in seen_seqs[flu]:
        # Entries are 'elm:seq'; the two-name unpack keeps rejecting
        # anything that does not have exactly one ':'.
        elm, _ = elmseq.split(':')
        elm2seqs[elm][elmseq] = True

counts = utils.count_host_elmSeqs(('Gallus_gallus', 'H_sapiens'),
                                  False, {},
                                  'working/Jun29/', working_elms,
                                  '.init')

for elm in working_elms:
    # Restrict host and flu counts to the sequences sampled for this ELM.
    use_seqs = elm2seqs[elm]
    host_vecs = utils.mk_count_vecs(counts, use_seqs)
    host_dists = utils.mk_count_dists(host_vecs)
    flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs)
    flu_dists = utils.mk_count_dists(flu_vecs)

    human_host = host_dists['H_sapiens']
    chicken_host = host_dists['Gallus_gallus']

    # Scores ending in _H use the human flu, scores ending in _C use the
    # chicken flu; the prefix names the host being compared against.
    human_flu = flu_dists['human']
    human_score_H = utils.jensen_shannon_dists(human_host, human_flu)
    chicken_score_H = utils.jensen_shannon_dists(chicken_host, human_flu)

    chicken_flu = flu_dists['chicken']
    human_score_C = utils.jensen_shannon_dists(human_host, chicken_flu)
    chicken_score_C = utils.jensen_shannon_dists(chicken_host, chicken_flu)

    # Hypothesis holds when each flu scores lower against its own host
    # than against the other host.
    if human_score_C > chicken_score_C and human_score_H < chicken_score_H:
        # Parenthesised form prints the same under Python 2 and also
        # parses under Python 3.
        print(elm)