def findEnrichedAnnotations_file(fgnd_file, bgnd_file, annotation_file, motifs):
    """ Check to see if the given motifs are enriched in the fg, as
    compared to the bg.

    File-based front end: loads the foreground and background gene lists
    with utils_graph.getNodes and the annotations with
    utils_motif.protein2annotation_forMotifs, then delegates to
    findEnrichedAnnotations.

    Return motif {}, each entry is
    [#of proteins with motif, |fg|, # of times bg permutation is as good as fg].
    Do 1000 background permutations (per the original notes; the
    permutations are not performed in this function).

    @param fgnd_file: file of foreground genes
    @param bgnd_file: file of background genes
    @param annotation_file: one annotation per line;
    @param motifs: {} of motifs you are checking for enrichment
    @return: [#of proteins with motif, |fg|,
              # of times bg is as good as fg] for each motif
    """
    fg_ls = utils_graph.getNodes(fgnd_file)
    bg_ls = utils_graph.getNodes(bgnd_file)
    annotations = utils_motif.protein2annotation_forMotifs(annotation_file,
                                                           motifs)
    # The foreground genes are bound to fg_ls.  The original passed the
    # undefined name fs_ls here, which raised NameError on every call.
    return findEnrichedAnnotations(fg_ls, bg_ls, annotations, motifs)
def annotate(entrez_gene_ls_file, gene2go_file):
    """Print the GO annotations for the genes listed in a file.

    One tab-separated line is printed per (protein, category, GO term)
    found in the gene2go mapping after getGOAnnotations has been called
    on it:
        protein, term name, distance reported by getMaxDistanceFromRoot,
        category

    @param entrez_gene_ls_file: gene list file read by utils_graph.getNodes
    @param gene2go_file: gene2go file handed to getTaxID and parseGene2GO
    """
    genes = utils_graph.getNodes(entrez_gene_ls_file)
    tax_id = getTaxID(genes, gene2go_file)
    # The result is not used below; the call itself is kept as in the
    # original in case it has side effects.
    getInformativeTerms(tax_id, 0)
    gene2go = parseGene2GO(gene2go_file, genes)
    term_names = getTermIDtoGOterm()
    child2Parents = getChild2Parents(tax_id, term_names)
    # presumably updates gene2go in place (return value is not used)
    # -- TODO confirm
    getGOAnnotations(tax_id, gene2go, child2Parents)
    termID2term = term_id2term()
    distances = getMaxDistanceFromRoot()

    for protein, by_category in gene2go.items():
        for category, go_terms in by_category.items():
            for go_term in go_terms.keys():
                # The term name is the first key stored under this term id.
                term_name = list(termID2term[go_term].keys())[0]
                print(protein + '\t' + term_name + '\t'
                      + str(distances[go_term]) + '\t' + category)
def mkFASTAfromFile(geneLsFile):
    """Fetch sequences for the genes in a list file, 500 genes per query.

    The genes are split into batches of at most 500.  Each batch is
    turned into a comma-terminated id string ("g1,g2,...,") and passed
    to wget_fasta together with an output name 'ncbi.query_<n>', where
    <n> counts batches from 1.  Every output name is appended to
    wget_files, and finally parseWget is called on each entry of
    wget_files.

    NOTE(review): wget_files and fout are not defined in this function;
    they are resolved as module-level names -- confirm they exist.

    Changes from the original:
      * removed a stray wget_fasta(query, wget_name) call that ran with
        wget_name either unbound (fewer than 500 genes) or still holding
        the previous batch's name, which overwrote that batch's output
        with the remainder;
      * an empty trailing batch (gene count is a multiple of 500, or no
        genes at all) is no longer fetched.

    @param geneLsFile: gene list file read by utils_graph.getNodes
    """
    batch_size = 500
    gene_ids = list(utils_graph.getNodes(geneLsFile))
    for start in range(0, len(gene_ids), batch_size):
        batch = gene_ids[start:start + batch_size]
        wget_name = 'ncbi.query_' + str(start // batch_size + 1)
        # Keep the trailing comma after every id, as the original did.
        query = ''.join([gene + ',' for gene in batch])
        wget_fasta(query, wget_name)
        wget_files.append(wget_name)
    for f in wget_files:
        parseWget(f, fout)
def prepAndColor(pathway, gene_file, html_color):
    """ Color these genes for this pathway.

    Asks the KEGG SOAP service (color_pathway_by_objects) for a picture
    of the pathway in which every gene from gene_file is drawn with a
    black foreground and an html_color background, then downloads the
    returned URL with wget.

    Does not color Cell Communication (path:hsa01430) (original note;
    nothing in this function enforces it).

    @param pathway: KEGG pathway, format path:hsa04010
    @param gene_file: one gene per line
    @param html_color: color, html format
    @return: name of the downloaded file, <pathway id>_<html_color>
    """
    import subprocess

    gene_dict = utils_graph.getNodes(gene_file)
    obj_ls = list(gene_dict.keys())
    fore_gnd = ['black'] * len(obj_ls)
    back_gnd = [html_color] * len(obj_ls)

    wsdl = 'http://soap.genome.jp/KEGG.wsdl'
    serv = WSDL.Proxy(wsdl)
    url = serv.color_pathway_by_objects(pathway, obj_ls, fore_gnd, back_gnd)

    name = pathway.split(':')[-1] + '_' + html_color
    # Pass an argument list instead of a shell string so characters in
    # the URL or colour (&, ?, ;, spaces) are never interpreted by a shell.
    # As before, wget's exit status is not checked.
    subprocess.call(['wget', '--output-document=' + name, url])
    return name
""" Make a plot of host ELM sequence frequencies for uniq influenza ELM sequences.""" from collections import defaultdict import utils, os, utils_graph good_elms = utils_graph.getNodes('working/Jul7/good_phylogeny_elms') def write_file(fname, uniq, this_host_freqs, that_host_freqs, this, that): used = {} with open(fname, 'w') as f: f.write('ELM\tSeq\tHost\tFreq\n') for elmseq in uniq: protein, elm, seq = elmseq.split(':') #new_seq = utils.mk_sub(seq) #new_seq = seq key = elm + ':' + seq #seq = new_seq if 'LIG' in key and key not in used and elm in good_elms: used[key] = True this_val = float(0) that_val = float(0) if key in this_host_freqs: this_val = this_host_freqs[key] if key in that_host_freqs: that_val = that_host_freqs[key] diffpos = max([float(0), this_val - that_val]) diffneg = max([float(0), that_val - this_val]) if this_val and that_val: # f.write('%s\t%s\t%s\t%.10f\n' % # (elm, seq, this, this_val))
to pick an arbitrary cutoff, and look at hub enrichment compared to all hits in the file. All hits is not the best way to go; really I need to ~1700 genes that were tested & could be knocked down w/o killing the cell, but these are not available. """ import utils_stats, utils_graph flu_rnai_file = '../Thesis/Data/Network/Flu/cell09/all_rnai' hubs_file = '../Thesis/Data/Hubs2/HPRD.entrez.expand.hubs20' net_file = '../Thesis/Data/Network/Human/HPRD/hprd_new.intr.ls.entrez' network_genes = set(utils_graph.getNodes(net_file)) hubs = set(utils_graph.getNodes(hubs_file)) all_rnai = {} replication_rnai = {} with open(flu_rnai_file) as f: for line in f: [entrez, delNS1, vRNA, replication] = [float(x) for x in line.strip().split('\t')] ID = str(int(entrez)) if vRNA < float(0): replication_rnai[ID] = True all_rnai[ID] = True bg = set(network_genes & set(all_rnai.keys())) rep_set = set(replication_rnai.keys())
""" For the input ELMs, take the JS divergence for chicken/human H5N1
to human & chicken.  For which ELMs does the hypothesis hold?
Sample equally from flus to avoid biases.

Usage: the first command-line argument is the file of ELMs to work with.
"""
import sys, utils, os, utils_graph
from collections import defaultdict

# ELMs to work with, loaded from the file named on the command line.
elm_file = sys.argv[1]
working_elms = utils_graph.getNodes(elm_file)

# Containers handed to utils.count_flu_sampled below.
flu_counts = {}
seen_seqs = {}
seen_seqs_ls = []  # not used in the visible part of this script
# elm -> {"ELM:sequence" key -> True} for every key seen in either flu
elm2seqs = defaultdict(dict)
flus = ("human", "chicken")
for flu in flus:
    # flu_elm_file = os.path.join('results',
    #                             flu + '.H5N1.elms')
    # NOTE(review): the module docstring speaks of H5N1 for both hosts,
    # but the human file used here is H3N2 2008 -- confirm which is meant.
    if "human" in flu:
        flu_elm_file = os.path.join("working/Jul1_year", flu + ".H3N2.2008.elms")
    else:
        flu_elm_file = os.path.join("working/Jul1_year/", flu + ".H5N1.2006.elms")
    # presumably fills flu_counts and seen_seqs in place: seen_seqs starts
    # empty and seen_seqs[flu] is read right after this call -- TODO confirm
    utils.count_flu_sampled(flu, flu_elm_file, flu_counts, seen_seqs, {}, False)
    for elmseq in seen_seqs[flu]:
        # keys have the form "ELM:sequence"; group them by ELM
        elm, seq = elmseq.split(":")
        elm2seqs[elm][elmseq] = True
# Host counts for chicken and human, restricted via working_elms
# (semantics of the remaining arguments are defined in utils -- not visible here).
counts = utils.count_host_elmSeqs(("Gallus_gallus", "H_sapiens"), False, {}, "working/Jun29/", working_elms, ".init")
import utils_motif, utils_graph use_elms = utils_graph.getNodes('use_elms') human = utils_motif.protein2annotation('human.H1N1.elms', {'ELM':True}) human_conserved = utils_motif.protein2annotation('human.H1N1.elms.90', {'ELM':True}) swine = utils_motif.protein2annotation('swine.H1N1.elms', {'ELM':True}) swine_conserved = utils_motif.protein2annotation('swine.H1N1.elms.90', {'ELM':True}) def get_entropy(afile): entropy = {} with open(afile) as f: for line in f: [elm, entropy_st] = line.strip().split('\t') if not elm in entropy: entropy[elm] = {} entropy[elm] = float(entropy_st) return entropy def get_best_seq(seqs): ls = [] for seq in seqs: ls.append([seqs[seq],seq]) ls.sort() #if len(ls) > 1: # print ls[0], ls[1]
#!/usr/bin/env python
"""For each HCV protein, calculate the likelihood of the GO BP similarity
between predictions and the gold standard.

This is done for the H1H2 and H1 prediction types.  One tab-separated
line is printed per (viral protein, prediction type): the viral protein,
the prediction type and the p-value with three decimals.

Arguments: HHE target pair file, prediction file, background gene file,
output file (read from the command line but not used below).
"""
import sys, utils_stats, utils_graph, utils_humanVirus, random, os

hhe_file = sys.argv[1]
hhp_file = sys.argv[2]
background_file = sys.argv[3]
out_file = sys.argv[4]

# The similarity file read below is expected to exist already; the call
# that builds it takes a long time and is therefore left disabled:
# utils_stats.gene_set_go_sim(background_file, 'results/HPRD.ls.entrez.gosim')
GOSIM_FILE = 'results/HPRD.ls.entrez.gosim'

hhe_vp2hp = utils_humanVirus.loadHHETargetPairs(hhe_file)
pred2vp2hp = utils_humanVirus.loadPredictions_predType2vp2hp(hhp_file)
all_hps = utils_graph.getNodes(background_file)

for pred_type in ('h1', 'h1h2'):
    vp2hp = pred2vp2hp[pred_type]
    for vp in vp2hp.keys():
        # Only viral proteins that also have gold-standard targets are scored.
        if vp not in hhe_vp2hp:
            continue
        # Gold-standard targets restricted to the background genes.
        gold = utils_graph.intersectLists([hhe_vp2hp[vp], all_hps]).keys()
        predicted = vp2hp[vp].keys()
        go_pval = utils_stats.gene_set_go_sim_pval(predicted, gold, GOSIM_FILE)
        print('%s\t%s\t%.3f' % (vp, pred_type, go_pval))
""" Are the 295 genes from the chandra 2010 nature paper enriched in
HPRD hubs?

The visible part loads the RNAi hits, the network genes and the hub,
party and date gene lists, then prints how many RNAi hits fall in the
party list and how many fall in the date list.
"""
import utils_graph, utils_stats

rnai_file = '../Thesis/Data/Network/Flu/nature2010/rnai_hits'
hubs_file = '../Thesis/Data/Hubs2/HPRD.entrez.expand.hubs20'
net_file = '../Thesis/Data/Network/Human/HPRD/hprd_new.intr.ls.entrez'
party_file = '../Thesis/Data/Hubs2/2.party'
date_file = '../Thesis/Data/Hubs2/2.date'

# Every gene list is loaded the same way and kept as a set.
rnai_genes = set(utils_graph.getNodes(rnai_file))
network_genes = set(utils_graph.getNodes(net_file))
hubs = set(utils_graph.getNodes(hubs_file))
party = set(utils_graph.getNodes(party_file))
date = set(utils_graph.getNodes(date_file))

# Overlap sizes, printed as "<party overlap> <date overlap>".
party_hits = party.intersection(rnai_genes)
date_hits = date.intersection(rnai_genes)
print('%d %d' % (len(party_hits), len(date_hits)))
"ELM", "../../Data/ELM/Human/human.website.elm", "ELM", "../../Data/ProfileScan/all.ProfileScan.scanHPRD.notNCBI", "ProfileScan", "../../Data/Network/Human/HPRD/hprd.intr", "../../Data/human.hprd.prosite", "../../Data/Network/Human/HPRD/version2entrezgeneid", "../../Data/Binding_Relations/ELM.ProfileScan.pairs", "some out 1", "some out 2", ] utils_scripting.checkStart(sys.argv, req_args, examples, len(req_args), True) virus_elm2protein = utils_motif.annotation2protein(sys.argv[1], {sys.argv[2]: True}) study_hps = utils_graph.getNodes(sys.argv[8]) human_elm2protein = utils_motif.annotation2protein_forProteins(sys.argv[3], {sys.argv[4]: True}, study_hps) human_cd2protein = utils_motif.annotation2protein_forProteins(sys.argv[5], {sys.argv[6]: True}, study_hps) network = utils_graph.getEdges(sys.argv[7]) version2geneid = utils_humanVirus.get_version2entrez(sys.argv[9]) elm2cd = utils_humanVirus.get_elm2prosites(sys.argv[10]) outf1 = sys.argv[11] outf2 = sys.argv[12] vp_to_h1_to_h2 = {} with open(outf1, "w") as f: for elm in virus_elm2protein.keys(): if human_elm2protein.has_key(elm): h2_noRestrictions = human_elm2protein[elm] h2 = {} h1 = {}
"""Use Jensen-Shannon divergence to make a dendrogram for eukaryotic hosts. Choose to cluster the ELM sequences before calculating JS divergence. To skip clusteirng, enter NA as the first argument. Otherwise, enter a closest flu distance file computed by flu_project_host_flu_closest.py. """ import itertools, sys, os, utils, random, global_settings, numpy, utils_plot, utils_graph from collections import defaultdict results_dir = sys.argv[1] # working/runs/Jun24/ out_file = sys.argv[2] f = os.path.join(results_dir, 'test_host_seqs') use_seqs = utils_graph.getNodes(f) counts = utils.count_host_seqs(global_settings.PLT_GENOMES, results_dir, use_seqs, '.init') ls = [] for host in counts: ls.append(counts[host]) all_elmSeqs = {} for host in counts: for elmSeq in counts[host]: all_elmSeqs[elmSeq] = True host_vecs = utils.mk_count_vecs(counts, all_elmSeqs) host_dists = utils.mk_count_dists(host_vecs) utils_plot.phylogeny_js(os.path.join(results_dir,
"../../Data/prosite.id2name", "../../Data/ProfileScan/MyLists/" "outfile", ] utils_scripting.checkStart(sys.argv, req_args, examples, len(req_args), True) elm2prosite = utils_humanVirus.get_elm2prosites(sys.argv[1]) cd2nameFile = sys.argv[2] mylistdir = sys.argv[3] outfile = sys.argv[4] cd2name = {} f = open(cd2nameFile) for line in f: [id, name] = line.strip().split("\t") cd2name[id] = name f.close() elms = elm2prosite.keys() elms.sort() with open(outfile, "w") as f: f.write("ELM\tBinding PROSITE or Entrez Gene IDs\n") for elm in elms: for cd in elm2prosite[elm].keys(): if cd2name.has_key(cd): f.write(elm + "\t" + cd2name[cd] + "\n") else: genes = utils_graph.getNodes(mylistdir + cd) genes_to_print = "" for gene in genes.keys(): genes_to_print = genes_to_print + gene + ";" f.write(elm + "\t" + genes_to_print.strip(";") + "\n")
"""Look up stats for the 35 ELM sequences unique to mammal flu."""
import utils_graph

# Entries of the "mU" list file, loaded with utils_graph.getNodes
# (presumably the mammal-unique ELM sequences named in the module
# docstring -- TODO confirm).
mU = utils_graph.getNodes("working/Jul1_year/mU")
# Names of the flu hosts (how they are used is not visible here).
species = ("chicken", "duck", "swine", "human", "equine")
for each vp and all. """ import utils_scripting, utils_humanVirus, utils_graph, sys, utils_stats req_args = ["niaid triplet file", "prediction file", "human proteins in study", "output file"] examples = [ "../../Runs/Clustering.domain.s/all_niaid_triplets", "../../Runs/Conservation70_Cutoff.2_Window10", "../../Data/human.hprd.prosite", "some out file", ] utils_scripting.checkStart(sys.argv, req_args, examples, len(req_args), True) hhe_vp2hp = utils_humanVirus.loadHHETargetPairs(sys.argv[1]) pred2vp2hp = utils_humanVirus.loadPredictions_predType2vp2hp(sys.argv[2]) all_hps = utils_graph.getNodes(sys.argv[3]) with open(sys.argv[4], "w") as fout: fout.write("Prediction Type\tVP\tHHE\tHHP\tMatch\tPrecsion\tRecall\tRandomPrecision\tPval\n") for predtype in pred2vp2hp.keys(): for vp in pred2vp2hp[predtype].keys(): if hhe_vp2hp.has_key(vp): hhe = utils_graph.intersectLists([hhe_vp2hp[vp], all_hps]) hhe_len = len(hhe.keys()) preds = pred2vp2hp[predtype][vp] preds_len = len(preds.keys()) match = utils_graph.intersectLists([hhe, preds]) match_len = len(match.keys()) precision = int(round(float(100) * float(match_len) / float(preds_len))) if hhe_len > 0: