Example #1
0
def getDistance(virus_d, host_d):
    """Return the cosine distance between a host and a virus profile.

    Both arguments are mappings from a sequence key to a numeric value.
    The profiles are aligned over the union of their keys (as produced
    by ``utils_graph.unionLists``); a key missing from one profile
    contributes ``0.0`` to that profile's vector.  The aligned vectors
    are passed to ``distance.cosine`` unnormalised and its result is
    returned unchanged.
    """
    # Materialise the key order once so both vectors are aligned.
    all_seqs = list(utils_graph.unionLists([virus_d, host_d]))
    host_v = [host_d[seq] if seq in host_d else float(0)
              for seq in all_seqs]
    virus_v = [virus_d[seq] if seq in virus_d else float(0)
               for seq in all_seqs]
    return distance.cosine(host_v, virus_v)
Example #2
0
def getDistance(virus_d, host_d):
    """Cosine distance between two key -> value profiles.

    The profiles are expanded into equal-length vectors over the union
    of their keys (``utils_graph.unionLists``), using ``0`` wherever a
    profile has no entry for a key, and the host and virus vectors are
    then compared with ``distance.cosine``.
    """
    def as_vector(profile, keys):
        # One entry per key, in key order; absent keys count as 0.
        return [profile[key] if key in profile else 0 for key in keys]

    shared_keys = list(utils_graph.unionLists([virus_d, host_d]))
    return distance.cosine(as_vector(host_d, shared_keys),
                           as_vector(virus_d, shared_keys))
Example #3
0
    #                             flu + '.H5N1.simpleELMs')
    # Count this flu's entries; the loop below reads the results back
    # from seen_seqs[flu] and flu_counts[flu].
    utils.count_flu_sampled(flu, flu_elm_file,
                            flu_counts, seen_seqs,
                            {}, False)
    # Keep the 'ELM:sequence' keys whose ELM part equals use_elm and
    # whose count for this flu is positive.
    use_seqs = {}
    for elmSeq in seen_seqs[flu]:
        elm, seq = elmSeq.split(':')
        if elm == use_elm and flu_counts[flu][elmSeq] > 0:
            use_seqs[elmSeq] = True
    seen_seqs_ls.append(use_seqs)

# remove seqs seen less than 10x
#for flu in flu_counts:
#    for elmSeq in 

# Merge the per-flu key dicts gathered in seen_seqs_ls into one key set.
use_seqs_pre = utils_graph.unionLists(seen_seqs_ls)

# Host-side counts for chicken and human, read from 'working/Jun29/'.
# NOTE(review): presumably {use_elm:True} restricts counting to use_elm,
# and False / {} mirror the arguments given to count_flu_sampled -- confirm
# against utils.count_host_elmSeqs.
counts = utils.count_host_elmSeqs(('Gallus_gallus','H_sapiens'),
                                  False, {},
                                  'working/Jun29/', {use_elm:True},
                                  '.init')

# Final key set: the flu-derived keys plus every key counted in either host.
use_seqs = utils_graph.unionLists([use_seqs_pre, counts['Gallus_gallus'],
                                       counts['H_sapiens']])
# Build host and flu vectors over the same key set, then pass each through
# mk_count_dists (presumably normalising counts into distributions --
# confirm in utils).
host_vecs = utils.mk_count_vecs(counts, use_seqs)
host_dists = utils.mk_count_dists(host_vecs)
flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs)  
flu_dists = utils.mk_count_dists(flu_vecs)

# for flu in flu_dists:
#     print sum(flu_dists[flu])
Example #4
0
# Load every host's ELM table ('elmdict_<host><suffix>' under RESULTSDIR).
# Each table is indexed as host2elmFreqs[host][elm] -> {sequence: value}.
for host in species:
    host2elmFreqs[host] = utils.get_seq2count_dict(
        os.path.join(local_settings.RESULTSDIR,
                     'elmdict_' + host + suffix),
        float(0))

# Write one row per ordered pair of distinct hosts: the mean of
# utils.getDistance over every ELM for which both hosts have data.
tmp_input = 'plots/for_aydin_2/cos_host_host' + suffix + '.tab'
with open(tmp_input, 'w') as f:
    f.write('Host1\tHost2\tDistance\n')
    for i, host1 in enumerate(species):
        for j, host2 in enumerate(species):
            if i == j:
                continue
            # Named dist_total (not `sum`) so the builtin stays usable.
            dist_total = float(0)
            elms_compared = 0
            for elm in utils_graph.unionLists([host2elmFreqs[host1],
                                               host2elmFreqs[host2]]):
                # setdefault keeps the original side effect: an ELM that is
                # missing for a host is recorded with an empty table.
                freqs1 = host2elmFreqs[host1].setdefault(elm, {})
                freqs2 = host2elmFreqs[host2].setdefault(elm, {})
                # Only ELMs with data on both sides are comparable.
                if freqs1 and freqs2:
                    dist_total += utils.getDistance(freqs1, freqs2)
                    elms_compared += 1
            if elms_compared:
                mean_distance = '%.10f' % (dist_total / elms_compared)
            else:
                # No comparable ELM: the mean is undefined.  Emit a
                # missing-value marker instead of dividing by zero.
                # NOTE(review): confirm the downstream reader accepts 'NA'.
                mean_distance = 'NA'
            f.write('%s\t%s\t%s\n'
                    % (short_names[host1],
                       short_names[host2],
                       mean_distance))

out_file = 'plots/for_aydin_2/cos_dis_sum_host' + suffix + '.png'
# NOTE(review): only 101 possible names, so concurrent runs can collide;
# the tempfile module would avoid that.
tmp_r = 'tmp_r' + str(random.randint(0,100))
with open(tmp_r, 'w') as f:
Example #5
0
                                     common_mammal_controled)
# Dump the three node sets first, so the dumps still include 'FAIL' keys.
utils_graph.dumpNodes('mammal_controled' + str(cut), use_mammal_controled)
utils_graph.dumpNodes('bird_controled' + str(cut), use_bird_controled)
utils_graph.dumpNodes('common_controled' + str(cut), common_all_elms_controled)

# Then drop every key containing 'FAIL' from each of the three dicts.
for controlled in (use_mammal_controled,
                   use_bird_controled,
                   common_all_elms_controled):
    # Iterate over a snapshot of the keys: deleting while walking a live
    # keys() view raises RuntimeError on Python 3.
    for k in list(controlled.keys()):
        if 'FAIL' in k:
            del controlled[k]

# ELMs to test: the union of the mammal and bird sets.
test_elms = utils_graph.unionLists([use_mammal, use_bird])
virus_elms_same = virus_elm_count = non_virus_elms_same = 0
with open('mammal_bird.different.' + str(cut) + '.test', 'w') as f:
    for elm in test_elms:
        if elm in ignore_elms:
            continue
        # test_it returns a (count, same) pair and is given the open
        # output file; both numbers are accumulated across ELMs.
        count, same = test_it(elm, elm2freq, f)
        virus_elms_same += same
        virus_elm_count += count

# Counters and container initialised for the '.notest' pass that follows.
non_virus_elms = non_virus_elms_all = 0
control_elms = {}
with open('mammal_bird.different.' + str(cut) + '.notest', 'w') as f:       
    for elm in elm2freq:
Example #6
0
""" What is the Jensen-shannon distance
    between 2 flu groups?
"""
import utils, sys, utils_graph

# The two flu group files are given as the first two command-line arguments.
flu_group_1_file, flu_group_2_file = sys.argv[1], sys.argv[2]

flu_counts = {}
seen_elmSeqs = {}
seen_seqs_ls = []
flu_groups = (('g1', flu_group_1_file),
              ('g2', flu_group_2_file))
for name, group_path in flu_groups:
    # Count this group's entries, then remember the keys seen for it.
    utils.count_flu_sampled(name, group_path, flu_counts,
                            seen_elmSeqs, {}, False)
    seen_seqs_ls.append(seen_elmSeqs[name])

# Build both groups' vectors over the union of the keys seen in either.
use_elmSeqs = utils_graph.unionLists(seen_seqs_ls)
flu_vecs = utils.mk_count_vecs(flu_counts, use_elmSeqs)
flu_dists = utils.mk_count_dists(flu_vecs)
js_dis = utils.jensen_shannon_dists(flu_dists['g1'], flu_dists['g2'])
print(js_dis)
    

Example #7
0
            # For every key hp of h2_noRestrictions, look at its neighbours
            # in `network`.  A neighbour found in human_cd2protein[cd] for a
            # cd in matching_cds is recorded in h1, hp is recorded in h2,
            # and the pair is stored as 'neighbour:hp' in h1_to_h2.
            for hp in h2_noRestrictions.keys():
                for hp_neigh in network[hp].keys():
                    for cd in matching_cds.keys():
                        if hp_neigh in human_cd2protein[cd]:
                            h1[hp_neigh] = True
                            h2[hp] = True
                            h1_to_h2[hp_neigh + ":" + hp] = True
            for vp in virus_elm2protein[elm].keys():
                # One output row per (vp, elm, prediction) for each labelled
                # prediction set, in the order h1, h2, h1h2, h2All.
                labelled_sets = (
                    (h1, "h1"),
                    (h2, "h2"),
                    (utils_graph.unionLists([h1, h2]), "h1h2"),
                    (h2_noRestrictions, "h2All"),
                )
                for preds, label in labelled_sets:
                    for pred in preds.keys():
                        # Predictions without a gene id mapping are skipped.
                        if pred in version2geneid:
                            f.write(vp + "\t" + elm + "\t"
                                    + version2geneid[pred]
                                    + "\t" + label + "\n")
                # Remember each h1:h2 pair for this vp; the entry for vp is
                # created only if there is at least one pair.
                for pair in h1_to_h2.keys():
                    vp_to_h1_to_h2.setdefault(vp, {})[pair] = True

with open(outf2, "w") as f:
    for vp in vp_to_h1_to_h2.keys():
        for pair in vp_to_h1_to_h2[vp]:
            [h1_gene, h2_gene] = pair.split(":")
            if version2geneid.has_key(h1_gene) and version2geneid.has_key(h2_gene):
    mapping = {}
hosts = global_settings.TEST_GENOMES
flus = ('human',)
flu_counts = {}
seen_seqs = {}
seen_seqs_ls = []

# Count each flu's entries from 'results/<flu>.H5N1.elms' and keep the
# keys seen for it.
for flu in flus:
    flu_elm_file = os.path.join('results/', flu + '.H5N1.elms')
    utils.count_flu_sampled(flu, flu_elm_file,
                            flu_counts, seen_seqs,
                            mapping, do_clustering)
    seen_seqs_ls.append(seen_seqs[flu])

# With a single flu its key dict is used directly; the union is only
# computed when there are several.
all_elmSeqs = (seen_seqs_ls[0]
               if len(seen_seqs_ls) <= 1
               else utils_graph.unionLists(seen_seqs_ls))

host_counts = utils.count_host_elmSeqs(hosts, do_clustering,
                                       mapping, elm_count_dir)

# Keys counted in either host, merged with the flu-derived keys.
host_found_seqs = utils_graph.unionLists(
    [host_counts['H_sapiens'], host_counts['Gallus_gallus']])
use_seqs = utils_graph.unionLists([all_elmSeqs, host_found_seqs])

# Flu and host vectors share the same key set.
flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs)
host_vecs = utils.mk_count_vecs(host_counts, use_seqs)
host_dists = utils.mk_count_dists(host_vecs)
flu_dists = utils.mk_count_dists(flu_vecs)