def getDistance(virus_d, host_d):
    """Return ``distance.cosine`` of the host and virus count vectors.

    Args:
        virus_d: mapping from a sequence key to its value for the virus.
        host_d: mapping from a sequence key to its value for the host.

    Returns:
        Whatever ``distance.cosine(host_vector, virus_vector)`` returns
        (presumably scipy.spatial.distance -- confirm against the imports).

    The two vectors are built over the keys yielded by
    ``utils_graph.unionLists([virus_d, host_d])``; a key missing from one
    mapping contributes 0.0 to that mapping's vector.
    """
    host_v = []
    virus_v = []
    for key in utils_graph.unionLists([virus_d, host_d]):
        # Both vectors are filled in the same key order so that positions
        # line up when they are compared.
        host_v.append(host_d[key] if key in host_d else 0.0)
        virus_v.append(virus_d[key] if key in virus_d else 0.0)
    return distance.cosine(host_v, virus_v)
def getDistance(virus_d, host_d):
    """Compare a virus and a host count mapping with ``distance.cosine``.

    Args:
        virus_d: mapping from a sequence key to its value for the virus.
        host_d: mapping from a sequence key to its value for the host.

    Returns:
        The value of ``distance.cosine(host_vector, virus_vector)``, where
        both vectors are laid out over the keys produced by
        ``utils_graph.unionLists([virus_d, host_d])`` and a key absent from
        a mapping is filled with 0.
    """
    # Freeze the key order once so both vectors share the same layout.
    keys = list(utils_graph.unionLists([virus_d, host_d]))
    host_v = [host_d[k] if k in host_d else 0 for k in keys]
    virus_v = [virus_d[k] if k in virus_d else 0 for k in keys]
    return distance.cosine(host_v, virus_v)
# flu + '.H5N1.simpleELMs') utils.count_flu_sampled(flu, flu_elm_file, flu_counts, seen_seqs, {}, False) use_seqs = {} for elmSeq in seen_seqs[flu]: elm, seq = elmSeq.split(':') if elm == use_elm: if flu_counts[flu][elmSeq] > 0: use_seqs[elmSeq] = True seen_seqs_ls.append(use_seqs) # remove seqs seen less than 10x #for flu in flu_counts: # for elmSeq in use_seqs_pre = utils_graph.unionLists(seen_seqs_ls) counts = utils.count_host_elmSeqs(('Gallus_gallus','H_sapiens'), False, {}, 'working/Jun29/', {use_elm:True}, '.init') use_seqs = utils_graph.unionLists([use_seqs_pre, counts['Gallus_gallus'], counts['H_sapiens']]) host_vecs = utils.mk_count_vecs(counts, use_seqs) host_dists = utils.mk_count_dists(host_vecs) flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs) flu_dists = utils.mk_count_dists(flu_vecs) # for flu in flu_dists: # print sum(flu_dists[flu])
# NOTE(review): nesting below was reconstructed from a whitespace-collapsed
# source -- confirm the indentation against the original file.

# Fill host2elmFreqs[host] from the 'elmdict_<host><suffix>' file under
# local_settings.RESULTSDIR. The float(0) argument is interpreted by
# utils.get_seq2count_dict (presumably a cutoff or default -- confirm there).
for host in species:
    host2elmFreqs[host] = utils.get_seq2count_dict(os.path.join(local_settings.RESULTSDIR, 'elmdict_' + host + suffix), float(0))
# Tab-separated table: one row per ordered pair of distinct hosts.
tmp_input = 'plots/for_aydin_2/cos_host_host' + suffix + '.tab'
with open(tmp_input, 'w') as f:
    f.write('Host1\tHost2\tDistance\n')
    # Every ordered pair (i, j) with i != j is visited, so each unordered
    # pair of hosts is written twice (once in each direction).
    for i in xrange(len(species)):
        for j in xrange(len(species)):
            if i != j:
                host1 = species[i]
                host2 = species[j]
                # NOTE(review): `sum` shadows the builtin from here on.
                sum = float(0)
                elms_compared = 0
                for elm in utils_graph.unionLists([host2elmFreqs[host1], host2elmFreqs[host2]]):
                    # Side effect: an ELM missing for a host gets an empty
                    # dict inserted, so the lookups below cannot KeyError.
                    for h in (host1, host2):
                        if elm not in host2elmFreqs[h]:
                            host2elmFreqs[h][elm] = {}
                    # Only ELMs with at least one entry for BOTH hosts
                    # contribute to the accumulated distance.
                    if len(host2elmFreqs[host1][elm].keys()) != 0 and len(host2elmFreqs[host2][elm].keys()) != 0:
                        sum += utils.getDistance(host2elmFreqs[host1][elm], host2elmFreqs[host2][elm])
                        elms_compared += 1
                # Mean distance over the compared ELMs.
                # NOTE(review): raises ZeroDivisionError when elms_compared
                # is 0, i.e. no ELM has entries for both hosts.
                f.write('%s\t%s\t%.10f\n' % (short_names[host1], short_names[host2], float(sum)/float(elms_compared)))
out_file = 'plots/for_aydin_2/cos_dis_sum_host' + suffix + '.png'
# Scratch file name with a random 0-100 suffix.
# NOTE(review): only 101 possible names, so concurrent runs can collide;
# the tempfile module would avoid that.
tmp_r = 'tmp_r' + str(random.randint(0,100))
with open(tmp_r, 'w') as f:
common_mammal_controled) utils_graph.dumpNodes('mammal_controled' + str(cut), use_mammal_controled) utils_graph.dumpNodes('bird_controled' + str(cut), use_bird_controled) utils_graph.dumpNodes('common_controled' + str(cut), common_all_elms_controled) for k in use_mammal_controled.keys(): if 'FAIL' in k: del use_mammal_controled[k] for k in use_bird_controled.keys(): if 'FAIL' in k: del use_bird_controled[k] for k in common_all_elms_controled.keys(): if 'FAIL' in k: del common_all_elms_controled[k] test_elms = utils_graph.unionLists([use_mammal, use_bird]) virus_elms_same = 0 virus_elm_count = 0 non_virus_elms_same = 0 with open('mammal_bird.different.' + str(cut) + '.test', 'w') as f: for elm in test_elms: if elm not in ignore_elms: count,same = test_it(elm, elm2freq, f) virus_elms_same += same virus_elm_count += count non_virus_elms = 0 non_virus_elms_all = 0 control_elms = {} with open('mammal_bird.different.' + str(cut) + '.notest', 'w') as f: for elm in elm2freq:
"""Print the Jensen-Shannon distance between two flu groups.

Usage:
    <script> FLU_GROUP_1_FILE FLU_GROUP_2_FILE

Each file is counted with ``utils.count_flu_sampled``; the counts of both
groups are laid out over the union of the ELM sequences seen in either
group, converted with ``utils.mk_count_dists``, and compared with
``utils.jensen_shannon_dists``. The resulting value is printed to stdout.
"""
import sys

import utils
import utils_graph

# Fail with a usage message instead of a bare IndexError traceback when the
# two input files are not supplied. Extra arguments are still ignored.
if len(sys.argv) < 3:
    sys.exit('usage: %s FLU_GROUP_1_FILE FLU_GROUP_2_FILE' % sys.argv[0])

flu_group_1_file = sys.argv[1]
flu_group_2_file = sys.argv[2]

flu_counts = {}
seen_elmSeqs = {}
seen_seqs_ls = []
# `path` rather than `file`, so the Python 2 builtin is not shadowed.
for name, path in (('g1', flu_group_1_file), ('g2', flu_group_2_file)):
    # count_flu_sampled must fill seen_elmSeqs[name]; it is read right after.
    utils.count_flu_sampled(name, path, flu_counts, seen_elmSeqs, {}, False)
    seen_seqs_ls.append(seen_elmSeqs[name])

# Both groups are vectorised over the same set of ELM sequences so that the
# two distributions are comparable position by position.
use_elmSeqs = utils_graph.unionLists(seen_seqs_ls)
flu_vecs = utils.mk_count_vecs(flu_counts, use_elmSeqs)
flu_dists = utils.mk_count_dists(flu_vecs)
js_dis = utils.jensen_shannon_dists(flu_dists['g1'], flu_dists['g2'])
# Parenthesised single argument prints identically on Python 2 and 3.
print(js_dis)
for hp in h2_noRestrictions.keys(): for hp_neigh in network[hp].keys(): for cd in matching_cds.keys(): if human_cd2protein[cd].has_key(hp_neigh): h1[hp_neigh] = True h2[hp] = True h1_to_h2[hp_neigh + ":" + hp] = True for vp in virus_elm2protein[elm].keys(): for pred in h1.keys(): if version2geneid.has_key(pred): f.write(vp + "\t" + elm + "\t" + version2geneid[pred] + "\th1\n") for pred in h2.keys(): if version2geneid.has_key(pred): f.write(vp + "\t" + elm + "\t" + version2geneid[pred] + "\th2\n") for pred in utils_graph.unionLists([h1, h2]).keys(): if version2geneid.has_key(pred): f.write(vp + "\t" + elm + "\t" + version2geneid[pred] + "\th1h2\n") for pred in h2_noRestrictions.keys(): if version2geneid.has_key(pred): f.write(vp + "\t" + elm + "\t" + version2geneid[pred] + "\th2All\n") for pair in h1_to_h2.keys(): if not vp_to_h1_to_h2.has_key(vp): vp_to_h1_to_h2[vp] = {} vp_to_h1_to_h2[vp][pair] = True with open(outf2, "w") as f: for vp in vp_to_h1_to_h2.keys(): for pair in vp_to_h1_to_h2[vp]: [h1_gene, h2_gene] = pair.split(":") if version2geneid.has_key(h1_gene) and version2geneid.has_key(h2_gene):
# NOTE(review): nesting below was reconstructed from a whitespace-collapsed
# source -- confirm the indentation against the original file.

# One mapping object is shared by the flu and host counting helpers below.
mapping = {}
hosts = global_settings.TEST_GENOMES
flus = ('human',)
flu_counts = {}
seen_seqs = {}
seen_seqs_ls = []
for flu in flus:
    flu_elm_file = os.path.join('results/', flu + '.H5N1.elms')
    # count_flu_sampled must fill seen_seqs[flu] (it is read on the next
    # line); flu_counts is consumed by mk_count_vecs further down.
    utils.count_flu_sampled(flu, flu_elm_file, flu_counts, seen_seqs, mapping, do_clustering)
    seen_seqs_ls.append(seen_seqs[flu])
# With a single flu group its own sequence set is used as-is; the union is
# only taken when there are several groups.
if len(seen_seqs_ls) > 1:
    all_elmSeqs = utils_graph.unionLists(seen_seqs_ls)
else:
    all_elmSeqs = seen_seqs_ls[0]
host_counts = utils.count_host_elmSeqs(hosts, do_clustering, mapping, elm_count_dir)
# Only the 'H_sapiens' and 'Gallus_gallus' entries feed the sequence set,
# whatever else `hosts` contains.
host_found_seqs = utils_graph.unionLists([host_counts['H_sapiens'], host_counts['Gallus_gallus']])
# Flu and host vectors are both laid out over the same combined sequence set.
use_seqs = utils_graph.unionLists([all_elmSeqs, host_found_seqs])
flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs)
host_vecs = utils.mk_count_vecs(host_counts, use_seqs)
host_dists = utils.mk_count_dists(host_vecs)
flu_dists = utils.mk_count_dists(flu_vecs)