Esempio n. 1
0
def seqDistance(virus_d, host_d):
    """Sum the virus_d values over the keys shared by virus_d and host_d.

    The shared keys are whatever utils_graph.intersectLists returns for
    [virus_d, host_d].  Only virus_d's values are added up; host_d is
    used solely as the second argument of the intersection.

    Returns a float (0.0 when the intersection is empty).
    """
    shared = utils_graph.intersectLists([virus_d, host_d])
    # Start from 0.0 so the total is a float even when the values are
    # integers, and use the builtin rather than shadowing it with a
    # local variable named `sum`.
    return sum((virus_d[seq] for seq in shared), 0.0)
Esempio n. 2
0
def getConservedELMs(virus, subtypes):
    """Intersect the ELM annotations loaded for each subtype of virus.

    For every entry of subtypes[virus], the file
    '<virus>.<subtype>.elms.70.controled' under
    local_settings.RESULTSDIR is loaded with
    utils_motif.annotation2protein(path, {'ELM': True}).  The loaded
    results are passed, in subtype order, to
    utils_graph.intersectLists, whose result is returned.
    """
    per_subtype = []
    for subtype in subtypes[virus]:
        file_name = virus + '.' + subtype + '.elms.70.controled'
        annotation_path = os.path.join(local_settings.RESULTSDIR, file_name)
        per_subtype.append(
            utils_motif.annotation2protein(annotation_path, {'ELM': True}))
    return utils_graph.intersectLists(per_subtype)
Esempio n. 3
0
def stEvalInfo(d1, d2, bg):
    """ Summarise the overlap of d1 and d2 against background bg.

        Returns one tab-separated string with three fields:
        the size of d1, the size of the d1/d2 intersection
        (utils_graph.intersectLists), and the value returned by
        utils_stats.prob3(len(bg), len(d1), len(d2), len(intersection)). """

    shared = utils_graph.intersectLists([d1, d2])
    d1_size = len(d1.keys())
    shared_size = len(shared.keys())
    pval = utils_stats.prob3(len(bg.keys()),
                             d1_size,
                             len(d2.keys()),
                             shared_size)
    fields = [str(d1_size), str(shared_size), str(pval)]
    return '\t'.join(fields)
Esempio n. 4
0
def getEnrichedClusters(gmeans_clusters, target_sets, background_genes, pval_cut):
    """ Find cluster/target-set pairs whose overlap scores below pval_cut.

        gmeans_clusters is {} of cluster_id to {} of genes.
        target_sets is {} of cat to {} of genes.
        background_genes is {} of genes; every cluster gene and every
        target-set gene must be a key of it.

        Return {} of cluster_id to {} of cat to
        [match_genes, cluster_per, target_per, pval], where
          match_genes is the utils_graph.intersectLists result for
            the cluster and the target set,
          cluster_per / target_per are the overlap size as an integer
            percentage (truncated) of the cluster / target set size,
          pval is the utils_stats.prob3 value for the pair.
        Only pairs with pval < pval_cut are stored, so a cluster with
        no such pair does not appear in the result.

        If a cluster or target-set gene is missing from
        background_genes, a message is printed and the process exits
        with status 1.
    """

    # Check to make sure everything is in the background.  This is an
    # error condition, so exit with a non-zero status (the message
    # text is unchanged).
    for label, gene_sets in (('cluster', gmeans_clusters),
                             ('target set', target_sets)):
        for genes in gene_sets.values():
            for gene in genes:
                if gene not in background_genes:
                    print(label + ' gene not in background genes')
                    sys.exit(1)

    ret_clusters = {}
    background_len = len(background_genes)
    for a_set, target_genes in target_sets.items():
        target_len = len(target_genes)
        for cluster, cluster_genes in gmeans_clusters.items():
            cluster_len = len(cluster_genes)
            match_genes = utils_graph.intersectLists([cluster_genes,
                                                       target_genes])
            match_len = len(match_genes)
            pval = utils_stats.prob3(background_len,
                                     cluster_len,
                                     target_len,
                                     match_len)
            if pval < pval_cut:
                # 100.0 forces float division before truncating to int.
                cluster_per = int(100.0 * match_len / cluster_len)
                target_per = int(100.0 * match_len / target_len)
                ret_clusters.setdefault(cluster, {})[a_set] = [
                    match_genes, cluster_per, target_per, pval]
    return ret_clusters
Esempio n. 5
0
#                 print elm

# with open('mammal_bird.notTest', 'w') as f:
#     for elm in control_elms:
#         if not elm in test_elms:
#             if elm in elm2fracs:
#                 if check_gtr(elm, elm2fracs):
#                     f.write(elm + '\tGTR\n')
#                 elif check_less(elm, elm2fracs):
#                     f.write(elm + '\tLESS\n')
#                 else:
#                     f.write(elm + '\tSAME\n')

test_elms = {}
with open('mammal_bird.' + cut + '.test', 'w') as f:
    for elm in utils_graph.intersectLists([use_elms,freq_elms]):
         if elm in mammal_elms and elm in bird_elms:
                     control_elms[elm] = True
         elif elm in elm2fracs:
             test_elms[elm] = True
             if check_gtr(elm, elm2fracs):
                 f.write(elm + '\tGTR\n')
             elif check_less(elm, elm2fracs):
                 f.write(elm + '\tLESS\n')
             else:
                 f.write(elm + '\tSAME\n')
         else:
             test_elms[elm] = True
             if check_gtr(elm, elm2freq):
                 f.write(elm + '\tGTR\n')
             elif check_less(elm, elm2freq):
Esempio n. 6
0
# Tally the ELMs of elm2freq that are NOT in test_elms.  These are
# recorded in control_elms and counted in the non_virus_* counters.
non_virus_elms = 0
non_virus_elms_all = 0
control_elms = {}
with open('mammal_bird.different.' + str(cut) + '.notest', 'w') as f:       
    for elm in elm2freq:
        if not elm in test_elms:
            control_elms[elm] = True
            non_virus_elms_all += 1
            if not elm in ignore_elms:
                # test_it is defined outside this view; it receives the
                # open output file and its (count, same) result is
                # accumulated into the two counters below.
                count,same = test_it(elm, elm2freq, f)
                non_virus_elms += count
                # NOTE(review): non_virus_elms_same is not initialised in
                # the visible code, unlike the two sibling counters above;
                # confirm it is set earlier in the file, otherwise this
                # line raises NameError.
                non_virus_elms_same += same            
                
# Build the two rows handed to utils_stats.fisher_positive_pval.
# virus_elm_count and virus_elms_same come from code outside this view.
# Test-side row: [counted minus same, same plus the test ELMs that were
# not counted, less the test ELMs that are also in ignore_elms].
diff_diff = virus_elm_count-virus_elms_same
diff_same = virus_elms_same + len(test_elms.keys())-virus_elm_count-len(utils_graph.intersectLists([test_elms,ignore_elms]))
# Control-side row, computed the same way from the non_virus_* counters
# and control_elms.
diff_bg_diff = non_virus_elms-non_virus_elms_same
diff_bg_same = non_virus_elms_same + non_virus_elms_all-non_virus_elms-len(utils_graph.intersectLists([control_elms,ignore_elms]))
# Write the p-value followed by both rows as tab-separated lines.
with open(str(cut) + '.different.results', 'w') as f:
    p = utils_stats.fisher_positive_pval([diff_diff,diff_same],
                                         [diff_bg_diff,diff_bg_same])
    f.write('pvalue\t' + str(p) + '\n')
    f.write('virus\t' + str(diff_diff)
            + '\t' + str(diff_same) + '\n')
    f.write('nvirus\t' + str(diff_bg_diff)
            + '\t' + str(diff_bg_same) + '\n')

# Members of common_all_elms that are not in test_elms.
test_elms_2 = {}
for c in common_all_elms:
    if not c in test_elms:
        test_elms_2[c] = True
Esempio n. 7
0
#!/usr/bin/env python

"""For each HCV protein, calculate the likelihood
   of the GO BP similarity between predictions
   and gold standard. Do this for H1H2 & H1.

   Command-line arguments, in order:
   hhe_file hhp_file background_file out_file

   One tab-separated line (virus protein, prediction type,
   p-value to three decimals) is printed for every virus
   protein that has both predictions and gold-standard targets.
"""
import sys, utils_stats, utils_graph, utils_humanVirus, random, os

hhe_file = sys.argv[1]
hhp_file = sys.argv[2]
background_file = sys.argv[3]
# NOTE(review): out_file is read from the command line but nothing in
# this script writes to it; results are printed to stdout.
out_file = sys.argv[4]

# this takes a long time
# utils_stats.gene_set_go_sim(background_file, 'results/HPRD.ls.entrez.gosim')

hhe_vp2hp = utils_humanVirus.loadHHETargetPairs(hhe_file)
pred2vp2hp = utils_humanVirus.loadPredictions_predType2vp2hp(hhp_file)
all_hps = utils_graph.getNodes(background_file)

for pred_type in ('h1', 'h1h2'):
    for vp in pred2vp2hp[pred_type]:
        # `in` replaces dict.has_key, which no longer exists in Python 3.
        if vp in hhe_vp2hp:
            # Restrict the gold standard to proteins present in the
            # background before comparing.  list() keeps these plain
            # lists on Python 3, where keys() returns a view.
            hhe = list(utils_graph.intersectLists([hhe_vp2hp[vp],
                                                   all_hps]).keys())
            preds = list(pred2vp2hp[pred_type][vp].keys())
            go_pval = utils_stats.gene_set_go_sim_pval(preds, hhe,
                                                       'results/HPRD.ls.entrez.gosim')
            print('%s\t%s\t%.3f' %
                  (vp, pred_type, go_pval))
Esempio n. 8
0
            elmSeq = elm + ':' + seq
            if elm == 'TRG_ENDOCYTIC_2':
                all_elmSeqs[elmSeq] = True
                flu_counts[flu][elmSeq] += 1
# Dump the per-flu counts built by the loop above.
print flu_counts
# Build the per-host counts from working/Jun29/elmdict_<host>.init.
host_all_Seqs = {}
for host in hosts:
    host_counts[host] = defaultdict(utils.init_zero)
    with open('working/Jun29/elmdict_' + host + '.init') as f:
        for line in f:
            # Each line is split into four tab-separated fields; fq is
            # unpacked but not used below.
            (elm, seq, count, fq) = line.strip().split('\t')
            elmSeq = elm + ':' + seq
            # Only entries for this one ELM are counted.
            if elm == 'TRG_ENDOCYTIC_2':
                # Host sequences are tracked in host_all_Seqs; adding them
                # to all_elmSeqs is commented out on the next line.
                #all_elmSeqs[elmSeq] = True
                host_all_Seqs[elmSeq] = True
                host_counts[host][elmSeq] += int(count)
# Print how many ELM:sequence keys host_all_Seqs and all_elmSeqs share.
print len(utils_graph.intersectLists([host_all_Seqs,
                                      all_elmSeqs]))
# Both the flu and the host vectors are built over all_elmSeqs.
# mk_count_vecs / mk_count_dists are defined outside this view.
flu_vecs = mk_count_vecs(flu_counts, all_elmSeqs)
flu_dists = mk_count_dists(flu_vecs)
host_vecs = mk_count_vecs(host_counts, all_elmSeqs)
host_dists = mk_count_dists(host_vecs)

# js_distances[host][flu] holds the utils.jensen_shannon_dists value for
# every host/flu pair; each value is also printed.
js_distances = defaultdict(dict)
for host in hosts:
    for flu in flus:
        js_dis = utils.jensen_shannon_dists(host_dists[host],
                                            flu_dists[flu])
        js_distances[host][flu] = js_dis
        print host, flu, js_dis
Esempio n. 9
0
        for line in f:
            (elm, seq, count, fq) = line.strip().split('\t')
            elmSeq = elm + ':' + seq
            if elm in mapping:
                if elmSeq in mapping[elm]:
                    key = mapping[elm][elmSeq]
                    host_counts[host][key] += int(count)
                    found_seqs[-1][key] = True
               #  else:
            #         host_counts[host][elmSeq] += int(count)
            #         found_seqs[-1][elmSeq] = True
            # else:
            #     host_counts[host][elmSeq] += int(count)
            #     found_seqs[-1][elmSeq] = True

# Restrict to the keys utils_graph.intersectLists reports for the entries
# of found_seqs (presumably one dict per host; confirm against the loop
# above), then build count vectors and distributions over those keys.
use_seqs = utils_graph.intersectLists(found_seqs)
host_vecs = mk_count_vecs(host_counts, use_seqs)
host_dists = mk_count_dists(host_vecs)

# File names used later in the script (not visible here).
# NOTE(review): the random.randint(0,100) suffix allows only 101 distinct
# names, so concurrent runs can collide; the tempfile module would avoid
# that. tmp_input has no suffix at all.
tmp_input = 'tmp_data'
tmp_r = 'tmp_r' + str(random.randint(0,100))
tmp_labels = 'labels' + str(random.randint(0,100))
out_file = 'working/try_clusters.png'

# Fill js_distances for every unordered pair of hosts, storing the value
# under both [host1][host2] and [host2][host1].
js_distances = defaultdict(dict)
for host1, host2 in itertools.combinations(hosts, 2):
    js_dis = utils.jensen_shannon_dists(host_dists[host1],
                                        host_dists[host2])
    js_distances[host1][host2] = js_dis
    js_distances[host2][host1] = js_dis
Esempio n. 10
0
    "../../Runs/Conservation70_Cutoff.2_Window10",
    "../../Data/human.hprd.prosite",
    "some out file",
]
utils_scripting.checkStart(sys.argv, req_args, examples, len(req_args), True)

hhe_vp2hp = utils_humanVirus.loadHHETargetPairs(sys.argv[1])
pred2vp2hp = utils_humanVirus.loadPredictions_predType2vp2hp(sys.argv[2])
all_hps = utils_graph.getNodes(sys.argv[3])

with open(sys.argv[4], "w") as fout:
    fout.write("Prediction Type\tVP\tHHE\tHHP\tMatch\tPrecsion\tRecall\tRandomPrecision\tPval\n")
    for predtype in pred2vp2hp.keys():
        for vp in pred2vp2hp[predtype].keys():
            if hhe_vp2hp.has_key(vp):
                hhe = utils_graph.intersectLists([hhe_vp2hp[vp], all_hps])

                hhe_len = len(hhe.keys())
                preds = pred2vp2hp[predtype][vp]
                preds_len = len(preds.keys())
                match = utils_graph.intersectLists([hhe, preds])
                match_len = len(match.keys())
                precision = int(round(float(100) * float(match_len) / float(preds_len)))
                if hhe_len > 0:
                    recall = int(round(float(100) * float(match_len) / float(hhe_len)))
                else:
                    recall = "NA"
                random_precision = int(round(float(100) * float(hhe_len) / float(len(all_hps.keys()))))
                if match_len != 0:
                    pval = utils_stats.prob3(len(all_hps.keys()), preds_len, hhe_len, match_len)
                else: