return False
    return True

# Command-line arguments:
#   1. results_dir -- passed through to utils.count_host_elmSeqs
#   2. suffix      -- passed through to utils.count_host_elmSeqs
#   3. elmfile     -- tab-separated file with two columns per line; only the
#                     first column (the ELM name) is used here
results_dir = sys.argv[1]
suffix = sys.argv[2]
elmfile = sys.argv[3]

# ELM names are stored as dict keys (the value is always True).
elms = {}
with open(elmfile) as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line has no columns to unpack and
            # would otherwise abort the run with a ValueError.
            continue
        elm, exp = line.split('\t')
        elms[elm] = True

# Evaluate each ELM on its own and report the ones whose pairwise host
# distances satisfy check_phylogeny.
for elm in elms:
    counts = utils.count_host_elmSeqs(global_settings.TEST_GENOMES,
                                      False, {},
                                      results_dir, {elm: True}, suffix)

    # Union of every sequence key that appears for at least one host.
    all_elmSeqs = {}
    for host in counts:
        for elmSeq in counts[host]:
            all_elmSeqs[elmSeq] = True

    host_vecs = utils.mk_count_vecs(counts, all_elmSeqs)
    host_dists = utils.mk_count_dists(host_vecs)

    # Symmetric table of pairwise distances: js[a][b] == js[b][a].
    js = defaultdict(dict)
    for host1, host2 in itertools.combinations(host_dists, 2):
        dis = utils.jensen_shannon_dists(host_dists[host1],
                                         host_dists[host2])
        js[host1][host2] = dis
        js[host2][host1] = dis

    if check_phylogeny(js):
        # Parenthesised single-argument form prints the same text under
        # Python 2 and is also valid Python 3.
        print(elm)
Exemple #2
0
    # Body of a loop whose header is outside this excerpt (presumably it
    # iterates over flu names -- TODO confirm). Collects, for the current
    # flu, the sequence keys that belong to use_elm and have a positive count.
    use_seqs = {}
    for elmSeq in seen_seqs[flu]:
        # Keys are split on ':' into exactly two parts: ELM name and sequence.
        elm, seq = elmSeq.split(':')
        if elm == use_elm:
            # Skip keys that were recorded but never actually counted.
            if flu_counts[flu][elmSeq] > 0:
                use_seqs[elmSeq] = True
    seen_seqs_ls.append(use_seqs)

# TODO: remove seqs seen fewer than 10 times (the stub below is unfinished)
#for flu in flu_counts:
#    for elmSeq in 

# Sequences observed in any of the flu sets collected so far.
use_seqs_pre = utils_graph.unionLists(seen_seqs_ls)

# Host counts for chicken and human, restricted to the single ELM use_elm.
counts = utils.count_host_elmSeqs(
    ('Gallus_gallus', 'H_sapiens'),
    False,
    {},
    'working/Jun29/',
    {use_elm: True},
    '.init',
)

# Final sequence set: flu-observed sequences plus everything found in
# either host.
use_seqs = utils_graph.unionLists([
    use_seqs_pre,
    counts['Gallus_gallus'],
    counts['H_sapiens'],
])

# Count vectors and derived distributions over the same sequence set, first
# for the hosts and then for the flu samples.
host_vecs = utils.mk_count_vecs(counts, use_seqs)
host_dists = utils.mk_count_dists(host_vecs)
flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs)
flu_dists = utils.mk_count_dists(flu_vecs)

# for flu in flu_dists:
#     print sum(flu_dists[flu])
# sys.exit(0)

for seq, hhuman, hchicken, hf, cf in zip(use_seqs, host_dists['Gallus_gallus'],
                                     host_dists['H_sapiens'],
        (elm, stuff) = line.strip().split('\t')
        use_elms[elm] = True

# Clustering is switched off by passing the literal 'NA' as distance file.
do_clustering = distance_file != 'NA'

if not do_clustering:
    # No clustering: hand an empty mapping to the counting helper.
    mapping = {}
else:
    dis_file = os.path.join(results_dir, distance_file)
    mapping = utils.get_clusters(dis_file, dis_cutoff_init,
                                 dis_cutoff_meta)

counts = utils.count_host_elmSeqs(global_settings.TEST_GENOMES,
                                  do_clustering, mapping,
                                  results_dir, use_elms, suffix)

# Per-host count tables, in the iteration order of counts. Only the
# disabled intersection variant below consumes this list.
ls = [counts[host] for host in counts]
#all_elmSeqs = utils_graph.intersectLists(ls)

# Union of every sequence key that appears for at least one host.
all_elmSeqs = {}
for host in counts:
    all_elmSeqs.update(dict.fromkeys(counts[host], True))

host_vecs = utils.mk_count_vecs(counts, all_elmSeqs)
host_dists = utils.mk_count_dists(host_vecs)

# Hand the per-host distributions to the plotting helper together with the
# output path inside results_dir.
out_path = os.path.join(results_dir, out_file)
utils_plot.phylogeny_js(out_path, host_dists)
seen_seqs_ls = []
# ELM name -> {"<elm>:<seq>" key: True} for every key seen in any flu set.
elm2seqs = defaultdict(dict)
flus = ("human", "chicken")
for flu in flus:
    # flu_elm_file = os.path.join('results',
    #                             flu + '.H5N1.elms')
    # The human and chicken samples are read from different strain/year files.
    flu_elm_file = (
        os.path.join("working/Jul1_year", flu + ".H3N2.2008.elms")
        if "human" in flu
        else os.path.join("working/Jul1_year/", flu + ".H5N1.2006.elms")
    )
    utils.count_flu_sampled(flu, flu_elm_file, flu_counts, seen_seqs, {}, False)
    for elmseq in seen_seqs[flu]:
        elm, seq = elmseq.split(":")
        elm2seqs[elm][elmseq] = True

counts = utils.count_host_elmSeqs(
    ("Gallus_gallus", "H_sapiens"),
    False,
    {},
    "working/Jun29/",
    working_elms,
    ".init",
)

# For each ELM, compare both host distributions against both flu
# distributions over the sequences seen for that ELM. The four scores are
# recomputed on every iteration; whatever consumes them is outside this
# excerpt.
for elm in working_elms:
    use_seqs = elm2seqs[elm]
    host_vecs = utils.mk_count_vecs(counts, use_seqs)
    host_dists = utils.mk_count_dists(host_vecs)
    flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs)
    flu_dists = utils.mk_count_dists(flu_vecs)

    human_host = host_dists["H_sapiens"]
    chicken_host = host_dists["Gallus_gallus"]

    # Scores against the human flu distribution.
    flu = flu_dists["human"]
    human_score_H = utils.jensen_shannon_dists(human_host, flu)
    chicken_score_H = utils.jensen_shannon_dists(chicken_host, flu)

    # Scores against the chicken flu distribution.
    flu = flu_dists["chicken"]
    human_score_C = utils.jensen_shannon_dists(human_host, flu)
    chicken_score_C = utils.jensen_shannon_dists(chicken_host, flu)
# Accumulators filled in by utils.count_flu_sampled, one entry per flu.
flu_counts = {}
seen_seqs = {}
seen_seqs_ls = []

for flu in flus:
    flu_elm_file = os.path.join('results/', flu + '.H5N1.elms')
    utils.count_flu_sampled(flu, flu_elm_file, flu_counts,
                            seen_seqs, mapping, do_clustering)
    seen_seqs_ls.append(seen_seqs[flu])

# With a single flu there is nothing to merge, so its set is used directly.
all_elmSeqs = (utils_graph.unionLists(seen_seqs_ls)
               if len(seen_seqs_ls) > 1
               else seen_seqs_ls[0])

host_counts = utils.count_host_elmSeqs(hosts, do_clustering, mapping,
                                       elm_count_dir)

# Sequences found in either host, then merged with the flu-observed ones.
human_seqs = host_counts['H_sapiens']
chicken_seqs = host_counts['Gallus_gallus']
host_found_seqs = utils_graph.unionLists([human_seqs, chicken_seqs])
use_seqs = utils_graph.unionLists([all_elmSeqs, host_found_seqs])

# Count vectors and derived distributions over the shared sequence set.
flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs)
host_vecs = utils.mk_count_vecs(host_counts, use_seqs)
host_dists = utils.mk_count_dists(host_vecs)
flu_dists = utils.mk_count_dists(flu_vecs)

js_distances = defaultdict(dict)
for host in ('H_sapiens', 'Gallus_gallus'):
    for flu in flus:
        js_dis = utils.jensen_shannon_dists(host_dists[host],
                                            flu_dists[flu])