# NOTE(review): the two returns below are the tail of a function whose header
# lies before this chunk (presumably check_phylogeny, which is called further
# down). They are kept verbatim; their indentation is a best guess.
        return False
    return True


# Positional command-line arguments.
results_dir = sys.argv[1]
suffix = sys.argv[2]
elmfile = sys.argv[3]

# Read the ELM names from the first tab-separated column of the list file.
# Only that column is used; the second one used to be unpacked into an
# unused name, which made blank lines raise ValueError.
elms = {}
with open(elmfile) as f:
    for line in f:
        fields = line.strip().split('\t')
        if not fields[0]:
            # Blank line: nothing to record.
            continue
        elms[fields[0]] = True

# Handle one ELM at a time: count its sequences per host, turn the counts
# into per-host distributions, compare every pair of hosts, and print the
# ELM when check_phylogeny accepts the resulting pairwise table.
for elm in elms:
    counts = utils.count_host_elmSeqs(global_settings.TEST_GENOMES, False, {},
                                      results_dir, {elm: True}, suffix)

    # Union of every sequence key that appears for any host.
    all_elmSeqs = {}
    for host in counts:
        for elmSeq in counts[host]:
            all_elmSeqs[elmSeq] = True

    host_vecs = utils.mk_count_vecs(counts, all_elmSeqs)
    host_dists = utils.mk_count_dists(host_vecs)

    # Symmetric host-by-host table of the values returned by
    # utils.jensen_shannon_dists; each unordered pair is computed once.
    js = defaultdict(dict)
    for host1, host2 in itertools.combinations(host_dists, 2):
        dis = utils.jensen_shannon_dists(host_dists[host1], host_dists[host2])
        js[host1][host2] = dis
        js[host2][host1] = dis

    if check_phylogeny(js):
        # The parenthesised form prints the same text under Python 2 and
        # also parses under Python 3.
        print(elm)
use_seqs = {} for elmSeq in seen_seqs[flu]: elm, seq = elmSeq.split(':') if elm == use_elm: if flu_counts[flu][elmSeq] > 0: use_seqs[elmSeq] = True seen_seqs_ls.append(use_seqs) # remove seqs seen less than 10x #for flu in flu_counts: # for elmSeq in use_seqs_pre = utils_graph.unionLists(seen_seqs_ls) counts = utils.count_host_elmSeqs(('Gallus_gallus','H_sapiens'), False, {}, 'working/Jun29/', {use_elm:True}, '.init') use_seqs = utils_graph.unionLists([use_seqs_pre, counts['Gallus_gallus'], counts['H_sapiens']]) host_vecs = utils.mk_count_vecs(counts, use_seqs) host_dists = utils.mk_count_dists(host_vecs) flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs) flu_dists = utils.mk_count_dists(flu_vecs) # for flu in flu_dists: # print sum(flu_dists[flu]) # sys.exit(0) for seq, hhuman, hchicken, hf, cf in zip(use_seqs, host_dists['Gallus_gallus'], host_dists['H_sapiens'],
# NOTE(review): this chunk opens inside a block whose header is not visible.
# The first two statements read `line`, so they are presumably the body of a
# loop over the lines of a file; their indentation here is a best guess.
        (elm, stuff) = line.strip().split('\t')
        use_elms[elm] = True

# Passing the literal 'NA' as the distance file switches clustering off.
do_clustering = distance_file != 'NA'
if do_clustering:
    dis_file = os.path.join(results_dir, distance_file)
    mapping = utils.get_clusters(dis_file, dis_cutoff_init, dis_cutoff_meta)
else:
    # No clustering: hand the counter an empty mapping.
    mapping = {}

counts = utils.count_host_elmSeqs(
    global_settings.TEST_GENOMES, do_clustering, mapping,
    results_dir, use_elms, suffix)

# Per-host count tables. Only the disabled variant
# `all_elmSeqs = utils_graph.intersectLists(ls)` read this list.
ls = [counts[host] for host in counts]

# Union of every sequence key that appears for any host.
all_elmSeqs = {}
for host in counts:
    for elmSeq in counts[host]:
        all_elmSeqs[elmSeq] = True

host_vecs = utils.mk_count_vecs(counts, all_elmSeqs)
host_dists = utils.mk_count_dists(host_vecs)

# Hand the per-host distributions to the plotting helper, which is given a
# path inside the results directory.
out_path = os.path.join(results_dir, out_file)
utils_plot.phylogeny_js(out_path, host_dists)
# flu_counts, seen_seqs and working_elms are defined outside this chunk.
seen_seqs_ls = []
elm2seqs = defaultdict(dict)  # ELM name -> {"elm:seq" key: True}
flus = ("human", "chicken")

for flu in flus:
    # The human and chicken inputs come from different subtype/year files.
    # (A disabled earlier variant read 'results' + flu + '.H5N1.elms'.)
    if "human" in flu:
        year_dir, tag = "working/Jul1_year", ".H3N2.2008.elms"
    else:
        year_dir, tag = "working/Jul1_year/", ".H5N1.2006.elms"
    flu_elm_file = os.path.join(year_dir, flu + tag)
    utils.count_flu_sampled(flu, flu_elm_file, flu_counts, seen_seqs, {}, False)

    # Index every key seen for this flu under its ELM name (the part before ':').
    for elmseq in seen_seqs[flu]:
        elm, seq = elmseq.split(":")
        elm2seqs[elm][elmseq] = True

counts = utils.count_host_elmSeqs(
    ("Gallus_gallus", "H_sapiens"), False, {}, "working/Jun29/",
    working_elms, ".init")

for elm in working_elms:
    # elm2seqs is a defaultdict, so an ELM never seen in any flu yields an
    # empty key set here rather than a KeyError.
    use_seqs = elm2seqs[elm]
    host_vecs = utils.mk_count_vecs(counts, use_seqs)
    host_dists = utils.mk_count_dists(host_vecs)
    flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs)
    flu_dists = utils.mk_count_dists(flu_vecs)

    human_host = host_dists["H_sapiens"]
    chicken_host = host_dists["Gallus_gallus"]

    # Each host against the human flu distribution ...
    flu = flu_dists["human"]
    human_score_H = utils.jensen_shannon_dists(human_host, flu)
    chicken_score_H = utils.jensen_shannon_dists(chicken_host, flu)

    # ... and against the chicken flu distribution.
    flu = flu_dists["chicken"]
    human_score_C = utils.jensen_shannon_dists(human_host, flu)
    chicken_score_C = utils.jensen_shannon_dists(chicken_host, flu)
# flus, hosts, mapping, do_clustering and elm_count_dir are defined outside
# this chunk.
flu_counts = {}
seen_seqs = {}
seen_seqs_ls = []

for flu in flus:
    elm_file_name = flu + '.H5N1.elms'
    flu_elm_file = os.path.join('results/', elm_file_name)
    utils.count_flu_sampled(flu, flu_elm_file, flu_counts, seen_seqs,
                            mapping, do_clustering)
    seen_seqs_ls.append(seen_seqs[flu])

# With a single flu its key set is used directly; otherwise take the union.
all_elmSeqs = (utils_graph.unionLists(seen_seqs_ls)
               if len(seen_seqs_ls) > 1
               else seen_seqs_ls[0])

host_counts = utils.count_host_elmSeqs(hosts, do_clustering, mapping, elm_count_dir)
host_found_seqs = utils_graph.unionLists(
    [host_counts['H_sapiens'], host_counts['Gallus_gallus']])

# Flu and host counts are vectorised over the same combined key set.
use_seqs = utils_graph.unionLists([all_elmSeqs, host_found_seqs])
flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs)
host_vecs = utils.mk_count_vecs(host_counts, use_seqs)
host_dists = utils.mk_count_dists(host_vecs)
flu_dists = utils.mk_count_dists(flu_vecs)

# NOTE(review): js_distances is created but not written in the visible part;
# the loop body may continue beyond the end of this chunk.
js_distances = defaultdict(dict)
for host in ('H_sapiens', 'Gallus_gallus'):
    host_dist = host_dists[host]
    for flu in flus:
        js_dis = utils.jensen_shannon_dists(host_dist, flu_dists[flu])