Ejemplo n.º 1
0
""" What is the Jensen-shannon distance
    between 2 flu groups?
"""
import utils, sys, utils_graph

# Command-line arguments: paths to the two flu group files to compare.
flu_group_1_file = sys.argv[1]
flu_group_2_file = sys.argv[2]

# Accumulators handed to utils.count_flu_sampled, which is expected to
# fill them in place: after each call seen_elmSeqs[name] is read back,
# and flu_counts is later turned into count vectors.
flu_counts = {}
seen_elmSeqs = {}
seen_seqs_ls = []
# `flu_file` (rather than `file`) avoids shadowing the Python 2 builtin.
for name, flu_file in (('g1', flu_group_1_file),
                       ('g2', flu_group_2_file)):
    # The empty dict and False are the helper's last two arguments
    # (presumably: no cluster mapping, clustering off -- confirm in utils).
    utils.count_flu_sampled(name, flu_file,
                            flu_counts,
                            seen_elmSeqs, {}, False)
    seen_seqs_ls.append(seen_elmSeqs[name])

# Both groups are vectorised over the union of everything seen in either
# group so that the two vectors share the same entries.
use_elmSeqs = utils_graph.unionLists(seen_seqs_ls)
flu_vecs = utils.mk_count_vecs(flu_counts, use_elmSeqs)
flu_dists = utils.mk_count_dists(flu_vecs)
js_dis = utils.jensen_shannon_dists(flu_dists['g1'],
                                    flu_dists['g2'])
# Parenthesised form prints the same single value under Python 2 and is
# also valid Python 3, where the bare `print x` statement is a SyntaxError.
print(js_dis)
    

Ejemplo n.º 2
0
# ELM name to restrict the analysis to, taken from the command line.
use_elm = sys.argv[1]

flus = ('human','chicken')
flu_counts = {}
seen_seqs = {}
seen_seqs_ls = []
for flu in flus:
    # The human set is read from the H3N2 2008 file; every other flu
    # uses the H5N1 2006 file.
    # (An older input was 'working/Jun30/' + flu + '.H5N1.simpleELMs'.)
    if 'human' not in flu:
        flu_elm_file = os.path.join('working/Jul1_year/',
                                    flu + '.H5N1.2006.elms')
    else:
        flu_elm_file = os.path.join('working/Jul1_year',
                                    flu + '.H3N2.2008.elms')
    utils.count_flu_sampled(flu, flu_elm_file, flu_counts,
                            seen_seqs, {}, False)

    # Keep only the 'ELM:sequence' keys that belong to the requested ELM
    # and have a positive count for this flu.
    use_seqs = {}
    for elmSeq in seen_seqs[flu]:
        elm, seq = elmSeq.split(':')
        if elm == use_elm and flu_counts[flu][elmSeq] > 0:
            use_seqs[elmSeq] = True
    seen_seqs_ls.append(use_seqs)

# TODO: remove seqs seen less than 10x (never implemented).

use_seqs_pre = utils_graph.unionLists(seen_seqs_ls)

counts = utils.count_host_elmSeqs(('Gallus_gallus','H_sapiens'),
# The cluster mapping is only loaded when clustering is requested;
# otherwise an empty mapping is passed to the counting helpers.
mapping = {}
if do_clustering:
    f = os.path.join(elm_count_dir, cluster_distance_file)
    mapping = utils.get_clusters(f, 2.5, float(2.5))
hosts = global_settings.TEST_GENOMES

# Only the human flu set is processed here.
flus = ('human',)
flu_counts, seen_seqs, seen_seqs_ls = {}, {}, []

for flu in flus:
    flu_elm_file = os.path.join('results/',
                                flu + '.H5N1.elms')
    utils.count_flu_sampled(flu, flu_elm_file, flu_counts,
                            seen_seqs, mapping, do_clustering)
    seen_seqs_ls.append(seen_seqs[flu])

# With a single flu its collection is used directly; a union is only
# needed when there are several.
all_elmSeqs = (utils_graph.unionLists(seen_seqs_ls)
               if len(seen_seqs_ls) > 1
               else seen_seqs_ls[0])

host_counts = utils.count_host_elmSeqs(hosts, do_clustering, mapping,
                                       elm_count_dir)

# Flu and host vectors are both built over the same key set: everything
# seen in the flu data plus everything counted for human and chicken.
host_found_seqs = utils_graph.unionLists(
    [host_counts['H_sapiens'], host_counts['Gallus_gallus']])
use_seqs = utils_graph.unionLists([all_elmSeqs, host_found_seqs])

flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs)
host_vecs = utils.mk_count_vecs(host_counts, use_seqs)