def test_update_association():
    """Run three propagate-counts code paths and check their results against each other.

    Every stage is timed with ``prt_hms``:

    1. Load the GO DAG from ``go-basic.obo`` under ``REPO``.
    2. Read ``goa_human.gaf`` twice: through ``dnld_assc`` and, as an
       annotation object, through ``get_objanno``.
    3. Build two independent copies of the ``dnld_assc`` mapping.
    4. Hand one copy to ``godag.update_association``, the other to the
       stand-alone ``update_association``, and obtain a third mapping from
       ``objanno.get_id2gos(namespace='BP', propagate_counts=True)``.
    5. Compare the first copy with the other two via ``_chk_assc`` and run
       ``_chk_godag`` on the DAG.
    """
    print('\n1) READ GODAG:')
    # Alternative association files noted by the original author:
    # gene_association.fb, gene_association.mgi
    fin_assc = os.path.join(REPO, "goa_human.gaf")
    fin_obo = os.path.join(REPO, "go-basic.obo")
    clock = timeit.default_timer()
    godag = get_godag(fin_obo)
    clock = prt_hms(
        clock, "Created two GODags: One for original and one for new propagate counts")

    print('\n2) READ ANNOTATIONS:')
    gene2gos_src = dnld_assc(fin_assc, godag)
    clock = prt_hms(clock, "Associations Read")
    anno = get_objanno(fin_assc, 'gaf', godag=godag)
    clock = prt_hms(clock, "Associations Read")

    print('\n3) MAKE COPIES OF ASSOCIATIONS:')
    # Each copy gets its own set objects so the update calls cannot share state.
    via_godag = dict((gene, set(goids)) for gene, goids in gene2gos_src.items())
    via_func = dict((gene, set(goids)) for gene, goids in gene2gos_src.items())
    clock = prt_hms(clock, "Associations Copied: One for original and one for new")

    print('\n4) UPDATE ASSOCIATIONS (PROPAGATE COUNTS):')
    godag.update_association(via_godag)
    clock = prt_hms(clock, "ORIG: godag.update_association(assc)")
    update_association(via_func, godag)
    clock = prt_hms(clock, "NEW SA: update_association(go2obj, assc_orig)")
    via_anno = anno.get_id2gos(namespace='BP', propagate_counts=True)
    prt_hms(clock, "NEW BASE: update_association(go2obj, assc_orig)")

    print('\n5) RUN CHECKS')
    for candidate in (via_func, via_anno):
        _chk_assc(via_godag, candidate)
    _chk_godag(godag, fin_obo)
def __init__(self):
    """Load the ``goa_human.gaf`` associations and cache summary data on the instance.

    Attributes set:
      prt            -- output stream (``sys.stdout``)
      gene2gos_orig  -- mapping returned by ``dnld_assc`` for the GAF under ``REPO``
      go2genes_orig  -- ``get_b2aset(gene2gos_orig)``
      min_genes      -- smallest value-set size in ``go2genes_orig``
      max_genes      -- largest value-set size in ``go2genes_orig``
    """
    self.prt = sys.stdout
    gaf_path = os.path.join(REPO, "goa_human.gaf")
    self.gene2gos_orig = dnld_assc(gaf_path, go2obj=None, prt=self.prt)
    self.go2genes_orig = get_b2aset(self.gene2gos_orig)
    set_sizes = list(map(len, self.go2genes_orig.values()))
    self.min_genes = min(set_sizes)
    self.max_genes = max(set_sizes)
    # Round-trip check: applying get_b2aset twice must give back the original mapping.
    round_trip = get_b2aset(self.go2genes_orig)
    assert self.gene2gos_orig == round_trip
def test_semantic_similarity():
    """Print information content and pairwise Resnik/Lin scores for a fixed GO ID list.

    For each (species, association file) pair the association file is read
    from the current working directory, a ``TermCounts`` object is built, the
    information content of every GO ID is printed, and the Resnik and Lin
    similarities of every pair of GO IDs are printed and asserted to be truthy.
    """
    go_terms = sorted([
        "GO:0140101",
        "GO:0140097",
        "GO:0140096",
        "GO:0140098",
        "GO:0015318",
        "GO:0140110",
    ])
    # (species label, association file name)
    species2file = [
        ('human', 'goa_human.gaf'),
        ('yeast', 'gene_association.sgd'),
    ]
    info_line = '{SPECIES} Information content {INFO:8.6f} {GO} {NAME}'
    pair_line = '{GO1} {GO2} {RESNIK:6.4f} {LIN:7.4f}'

    workdir = os.getcwd()
    godag = get_godag(os.path.join(workdir, "go-basic.obo"), loading_bar=None)

    for species, fname in species2file:
        print()
        # Annotations for the current species
        gene2gos = dnld_assc(os.path.join(workdir, fname), godag, prt=None)
        termcounts = TermCounts(godag, gene2gos)

        # Information content of each GO term
        for goid in go_terms:
            print(info_line.format(
                SPECIES=species,
                GO=goid,
                INFO=get_info_content(goid, termcounts),
                NAME=godag[goid].name))

        # Semantic similarity of each pair of GO terms
        print("GO #1 GO #2 Resnik Lin")
        print("---------- ---------- ------ -------")
        for first, second in itertools.combinations(go_terms, 2):
            # Resnik: information content of the most informative common ancestor,
            # i.e. the most specific common parent term in the GO.
            resnik = resnik_sim(first, second, godag, termcounts)
            lin = lin_sim(first, second, godag, termcounts)
            print(pair_line.format(GO1=first, GO2=second, RESNIK=resnik, LIN=lin))
            assert resnik, "FATAL RESNIK SCORE"
            assert lin, "FATAL LIN SCORE"
def test_semantic_similarity():
    """Exercise the semantic-similarity helpers on two BP terms using TAIR annotations.

    Asserted facts:
      * the information content of GO:0048364 is truthy;
      * the deepest common ancestor of the two terms is ``NS2GO['BP']``, so the
        Resnik score equals that ancestor's information content and is 0.0;
      * the Lin score of the two terms is 0.0;
      * for the CC top term compared with itself, Resnik is 0.0 and Lin is 1.0.
    """
    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
    # Arabidopsis annotations
    gene2gos = dnld_assc(os.path.join(REPO, 'tair.gaf'), godag)

    root_dev = 'GO:0048364'   # BP level-03 depth-04 root development
    multicell = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process

    # Semantic similarity that needs only the DAG (original comment quotes 0.25)
    plain_sim = semantic_similarity(root_dev, multicell, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        GO1=root_dev, GO2=multicell, VAL=plain_sim))
    for goid in (root_dev, multicell):
        print(godag[goid])

    # Counts per GO term are needed for the information-content based measures.
    # The original comment quotes: Information content (GO:0048364) = 7.75481392334
    termcounts = TermCounts(godag, gene2gos)
    info = get_info_content(root_dev, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=root_dev, INFO=info))
    assert info, "FATAL INFORMATION CONTENT"

    # Resnik: information content of the most informative common ancestor.
    # Here that ancestor is the BP top term, hence a score of 0.0.
    resnik = resnik_sim(root_dev, multicell, godag, termcounts)
    ancestor = deepest_common_ancestor([root_dev, multicell], godag)
    assert ancestor == NS2GO['BP']
    assert resnik == get_info_content(ancestor, termcounts)
    assert resnik == 0.0
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=root_dev, GO2=multicell, VAL=resnik))

    # Lin is 0.0 as well because the terms are related only through the BP top term.
    lin = lin_sim(root_dev, multicell, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=root_dev, GO2=multicell, VAL=lin))
    assert lin == 0.0, "FATAL LIN SCORE"

    # A namespace top term compared with itself
    cc_top = NS2GO['CC']
    resnik = resnik_sim(cc_top, cc_top, godag, termcounts)
    assert resnik == 0.0
    lin = lin_sim(cc_top, cc_top, godag, termcounts)
    assert lin == 1.0
def test_semantic_similarity():
    """Print and sanity-check similarity scores for two BP terms (TAIR annotations).

    Both input files are looked up in the current working directory. The
    information content, the Resnik score and the Lin score are each asserted
    to be truthy; the semantic similarity is only printed.
    """
    here = os.getcwd()
    godag = get_godag(os.path.join(here, "go-basic.obo"), loading_bar=None)
    # Arabidopsis annotations
    gene2gos = dnld_assc(os.path.join(here, 'gene_association.tair'), godag)

    term_a = 'GO:0048364'  # BP level-03 depth-04 root development
    term_b = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    pair = dict(GO1=term_a, GO2=term_b)

    # DAG-only semantic similarity (original comment quotes 0.25)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        VAL=semantic_similarity(term_a, term_b, godag), **pair))
    print(godag[term_a])
    print(godag[term_b])

    # Counts per GO term, then the information content of one term.
    # The original comment quotes: Information content (GO:0048364) = 7.75481392334
    termcounts = TermCounts(godag, gene2gos)
    info = get_info_content(term_a, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=term_a, INFO=info))
    assert info, "FATAL INFORMATION CONTENT"

    # Resnik: information content of the most informative common ancestor
    # (original comment quotes 4.0540784252 for this pair).
    resnik = resnik_sim(term_a, term_b, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=resnik, **pair))
    assert resnik, "FATAL RESNIK SCORE"

    # Lin (original comment quotes -0.607721957763 for this pair)
    lin = lin_sim(term_a, term_b, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=lin, **pair))
    assert lin, "FATAL LIN SCORE"
def test_semantic_similarity():
    """Print MF information content and pairwise Resnik/Lin scores for fixed GO IDs.

    For each (species, association file) pair found under ``REPO`` the
    associations are read with ``namespace='MF'``, a ``TermCounts`` object is
    built, the information content of every GO ID is printed together with
    its namespace and name, and the Resnik and Lin similarities of every pair
    of GO IDs are printed and asserted to be truthy.
    """
    ordered_goids = sorted([
        "GO:0140101",
        "GO:0140097",
        "GO:0140096",
        "GO:0140098",
        "GO:0015318",
        "GO:0140110",
    ])
    # (species label, association file name)
    sources = [
        ('human', 'goa_human.gaf'),
        ('yeast', 'sgd.gaf'),
    ]
    fmt_info = '{SPECIES} Information content {INFO:8.6f} {NS} {GO} {NAME}'
    fmt_pair = '{GO1} {GO2} {RESNIK:6.4f} {LIN:7.4f}'

    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)

    for species, fname in sources:
        print()
        # Annotations for the current species
        gene2gos = dnld_assc(os.path.join(REPO, fname), godag, namespace='MF', prt=None)
        termcounts = TermCounts(godag, gene2gos)

        # Information content of each GO term
        for goid in ordered_goids:
            info = get_info_content(goid, termcounts)
            goterm = godag[goid]
            print(fmt_info.format(
                SPECIES=species, GO=goid, INFO=info,
                NS=goterm.namespace, NAME=goterm.name))

        # Semantic similarity of each pair of GO terms
        print("GO #1 GO #2 Resnik Lin")
        print("---------- ---------- ------ -------")
        for left, right in itertools.combinations(ordered_goids, 2):
            # Resnik: information content of the most informative common ancestor,
            # i.e. the most specific common parent term in the GO.
            score_r = resnik_sim(left, right, godag, termcounts)
            score_l = lin_sim(left, right, godag, termcounts)
            print(fmt_pair.format(GO1=left, GO2=right, RESNIK=score_r, LIN=score_l))
            assert score_r, "FATAL RESNIK SCORE"
            assert score_l, "FATAL LIN SCORE"
def describe_assc(org, fin_assc, go2obj, obj, prt):
    """Print two rows of set-size statistics for one association file.

    Arguments:
      org      -- label placed at the start of each row name
      fin_assc -- association file passed to ``dnld_assc``
      go2obj   -- GO DAG passed to ``dnld_assc``
      obj      -- object whose ``prt_data(name, counts, prt)`` does the reporting
      prt      -- output stream forwarded to ``obj.prt_data``
    """
    # Example output recorded by the original author:
    #
    # Assc.       | # Assc| range      | 25th | median | 75th | mean | stddev
    # ------------|-------|------------|------|--------|------|------|-------
    # hsa GO/gene | 19394 | 1 to 212   |    5 |      9 |   17 |   13 |     14
    # hsa gene/GO | 17277 | 1 to 8,897 |    1 |      3 |    8 |   15 |    120
    #
    # mus GO/gene | 19870 | 1 to 261   |    5 |     10 |   18 |   14 |     15
    # mus gene/GO | 17491 | 1 to 7,009 |    1 |      3 |    8 |   16 |    129
    #
    # dme GO/gene | 12551 | 1 to 137   |    2 |      4 |    8 |    6 |      7
    # dme gene/GO |  7878 | 1 to 1,675 |    1 |      3 |    7 |   10 |     41
    assc = dnld_assc(fin_assc, go2obj, prt=None)
    assc_inverse = get_b2aset(assc)
    # Size of every value set, in both directions of the mapping
    gos_per_gene = list(map(len, assc.values()))
    genes_per_go = list(map(len, assc_inverse.values()))
    rows = (
        ("{ORG} GO/gene", gos_per_gene),
        ("{ORG} gene/GO", genes_per_go),
    )
    for name_fmt, counts in rows:
        obj.prt_data(name_fmt.format(ORG=org), counts, prt)
def test_semantic_i88():
    """Print semantic-similarity values for two BP terms using TAIR annotations.

    Builds a ``GoSubDag`` over every key of the GO DAG (with term counts
    attached), then prints the DAG-only semantic similarity, the two terms
    plus their deepest common ancestor, the information content of
    GO:0048364, and the Resnik and Lin scores. Nothing is asserted; the test
    passes when every call completes without raising.
    """
    # NOTE(review): the OBO file is opened relative to the current working
    # directory while the GAF path below is built from REPO -- confirm intended.
    godag = obo_parser.GODag("go-basic.obo")
    # Every key of the DAG. An earlier assignment that kept only keys equal to
    # the term's own id was overwritten before use and has been removed.
    goids = set(godag.keys())

    # Arabidopsis annotations (dnld_assc includes read_gaf)
    fin_gaf = os.path.join(REPO, "tair.gaf")
    associations = dnld_assc(fin_gaf, godag, prt=None)

    # Counts for each GO term, attached to the sub-DAG used for printing
    termcounts = TermCounts(godag, associations)
    gosubdag = GoSubDag(goids, godag, tcntobj=termcounts)

    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process

    # DAG-only semantic similarity (original comment quotes 0.25)
    go_root = deepest_common_ancestor([go_id3, go_id4], godag)
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim))
    gosubdag.prt_goids([go_root, go_id3, go_id4])

    # Information content of a single term
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))

    # Resnik: information content of the most informative common ancestor,
    # i.e. the most specific common parent term in the GO
    # (original comment quotes 4.0540784252 for this pair).
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_r))

    # Lin (original comment quotes -0.607721957763 for this pair)
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_l))
def test_semantic_similarity(usr_assc=None):
    """Build term counts for many association files and print count summaries.

    Arguments:
      usr_assc -- optional single association file name; when given it
                  replaces the default list (``ASSOCIATIONS`` minus the two
                  ``goa_uniprot_all*`` files).

    Files are looked up in the current working directory. A file for which
    ``dnld_assc`` returns nothing is collected and reported at the end through
    ``_prt_not_found``.
    """
    # Source of the association files: http://current.geneontology.org/annotations/
    excluded = {'goa_uniprot_all.gaf', 'goa_uniprot_all_noiea.gaf'}
    default_names = sorted(ASSOCIATIONS.difference(excluded))
    go2obj = get_go2obj()
    associations = default_names if usr_assc is None else [usr_assc]

    workdir = os.getcwd()
    missing = set()
    for fname in associations:
        started = timeit.default_timer()
        gene2gos = dnld_assc(os.path.join(workdir, fname), go2obj, prt=sys.stdout)
        if not gene2gos:
            missing.add(fname)
            continue

        # Counts of each GO term, most frequent first
        termcounts = TermCounts(go2obj, gene2gos)
        ranked = termcounts.gocnts.most_common()
        if ranked:
            print("{ASSC}".format(ASSC=fname))
            print(sorted(termcounts.aspect_counts.most_common()))
            top_count = ranked[0][1]
            # No threshold, then half and a tenth of the largest count
            for threshold in (None, top_count / 2.0, top_count / 10.0):
                prt_info(termcounts, ranked, threshold)
            print("{HMS} {hms} {ASSC}\n".format(
                ASSC=fname, HMS=_hms(TIC), hms=_hms(started)))

    print('{HMS} {N} Associations'.format(HMS=_hms(TIC), N=len(associations)))
    if missing:
        _prt_not_found(missing)
def test_semantic_similarity():
    """Print and sanity-check similarity scores for two BP terms (TAIR annotations).

    Input files are taken from ``REPO``. The information content, the Resnik
    score and the Lin score are each asserted to be truthy; the DAG-only
    semantic similarity is only printed.
    """
    fin_obo = os.path.join(REPO, "go-basic.obo")
    godag = get_godag(fin_obo, loading_bar=None)
    # Arabidopsis annotations
    fin_gaf = os.path.join(REPO, 'tair.gaf')
    annots = dnld_assc(fin_gaf, godag)

    goid_x = 'GO:0048364'  # BP level-03 depth-04 root development
    goid_y = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process

    def _show(template, value):
        """Print one score line for the (goid_x, goid_y) pair."""
        print(template.format(GO1=goid_x, GO2=goid_y, VAL=value))

    # DAG-only semantic similarity (original comment quotes 0.25)
    _show('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.',
          semantic_similarity(goid_x, goid_y, godag))
    print(godag[goid_x])
    print(godag[goid_y])

    # Counts per GO term, then the information content of one term.
    # The original comment quotes: Information content (GO:0048364) = 7.75481392334
    termcounts = TermCounts(godag, annots)
    info = get_info_content(goid_x, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=goid_x, INFO=info))
    assert info, "FATAL INFORMATION CONTENT"

    # Resnik: information content of the most informative common ancestor
    # (original comment quotes 4.0540784252 for this pair).
    resnik = resnik_sim(goid_x, goid_y, godag, termcounts)
    _show('Resnik similarity score ({GO1}, {GO2}) = {VAL}', resnik)
    assert resnik, "FATAL RESNIK SCORE"

    # Lin (original comment quotes -0.607721957763 for this pair)
    lin = lin_sim(goid_x, goid_y, godag, termcounts)
    _show('Lin similarity score ({GO1}, {GO2}) = {VAL}', lin)
    assert lin, "FATAL LIN SCORE"
def describe_assc(org, fin_assc, go2obj, obj, prt):
    """Print two rows of set-size statistics for one association file.

    Both the mapping from ``dnld_assc`` and the one from ``get_b2aset`` are
    asserted to be non-empty before any statistics are reported.

    Arguments:
      org      -- label placed at the start of each row name
      fin_assc -- association file passed to ``dnld_assc``
      go2obj   -- GO DAG passed to ``dnld_assc``
      obj      -- object whose ``prt_data(name, counts, prt)`` does the reporting
      prt      -- output stream forwarded to ``obj.prt_data``
    """
    # Example output recorded by the original author:
    #
    # Assc.       | # Assc| range      | 25th | median | 75th | mean | stddev
    # ------------|-------|------------|------|--------|------|------|-------
    # hsa GO/gene | 19394 | 1 to 212   |    5 |      9 |   17 |   13 |     14
    # hsa gene/GO | 17277 | 1 to 8,897 |    1 |      3 |    8 |   15 |    120
    #
    # mus GO/gene | 19870 | 1 to 261   |    5 |     10 |   18 |   14 |     15
    # mus gene/GO | 17491 | 1 to 7,009 |    1 |      3 |    8 |   16 |    129
    #
    # dme GO/gene | 12551 | 1 to 137   |    2 |      4 |    8 |    6 |      7
    # dme gene/GO |  7878 | 1 to 1,675 |    1 |      3 |    7 |   10 |     41
    forward = dnld_assc(fin_assc, go2obj, prt=None)
    backward = get_b2aset(forward)
    assert forward
    assert backward

    def _sizes(mapping):
        """Return the size of every value set in *mapping*."""
        return [len(members) for members in mapping.values()]

    gos_per_gene = _sizes(forward)
    genes_per_go = _sizes(backward)
    obj.prt_data("{ORG} GO/gene".format(ORG=org), gos_per_gene, prt)
    obj.prt_data("{ORG} gene/GO".format(ORG=org), genes_per_go, prt)
def _test_path_bp_mf(branch_dist, godag, prt):
    """Check similarity measures between one MF term and one BP term.

    Without ``branch_dist`` the semantic distance, semantic similarity, Resnik
    score and Lin score are all asserted to be ``None``. With ``branch_dist``
    the semantic distance is asserted to equal the sum of both term depths
    plus ``branch_dist``.

    Arguments:
      branch_dist -- value passed as the fourth argument of ``semantic_distance``
      godag       -- GO DAG; indexed by GO ID to read each term's ``depth``
      prt         -- stream with ``write``, or ``None`` to suppress the report
    """
    mf_term = 'GO:0003676'  # level-03 depth-03 nucleic acid binding [molecular_function]
    bp_term = 'GO:0007516'  # level-04 depth-05 hemocyte development [biological_process]
    row = '({GO1}, {GO2}) {TYPE:6} score = {VAL}\n'

    def _report(label, value):
        """Write one result row unless output is suppressed."""
        if prt is not None:
            prt.write(row.format(TYPE=label, GO1=mf_term, GO2=bp_term, VAL=value))

    dist_plain = semantic_distance(mf_term, bp_term, godag)
    sim_plain = semantic_similarity(mf_term, bp_term, godag)
    termcounts = TermCounts(godag, dnld_assc("gene_association.tair", godag))
    resnik = resnik_sim(mf_term, bp_term, godag, termcounts)
    lin = lin_sim(mf_term, bp_term, godag, termcounts)

    cross_branch = (
        ('semantic distance', dist_plain),
        ('semantic similarity', sim_plain),
        ('Resnik similarity', resnik),
        ('Lin similarity', lin),
    )
    # Report everything first so the values are visible even if a check fails.
    for label, value in cross_branch:
        _report(label, value)
    for _, value in cross_branch:
        assert value is None

    dist_branch = semantic_distance(mf_term, bp_term, godag, branch_dist)
    _report('semantic distance', dist_branch)
    expected = godag[mf_term].depth + godag[bp_term].depth + branch_dist
    assert dist_branch == expected
def test_semantic_i88():
    """Print semantic-similarity values for two BP terms using TAIR annotations.

    A ``GoSubDag`` covering every key of the GO DAG is built with term counts
    attached. The test then prints the DAG-only semantic similarity, the two
    terms together with their deepest common ancestor, the information
    content of GO:0048364, and the Resnik and Lin scores. There are no
    assertions; the test passes when every call completes without raising.
    """
    # NOTE(review): the OBO file is opened relative to the current working
    # directory while the GAF path below is built from REPO -- confirm intended.
    godag = obo_parser.GODag("go-basic.obo")
    # All DAG keys. A preceding assignment restricted to keys equal to the
    # term's own id was a dead store (overwritten immediately) and is dropped.
    goids = set(godag.keys())

    # Arabidopsis annotations (dnld_assc includes read_gaf)
    fin_gaf = os.path.join(REPO, "tair.gaf")
    associations = dnld_assc(fin_gaf, godag, prt=None)

    # Counts for each GO term, attached to the sub-DAG used for printing
    termcounts = TermCounts(godag, associations)
    gosubdag = GoSubDag(goids, godag, tcntobj=termcounts)

    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process

    # DAG-only semantic similarity (original comment quotes 0.25)
    go_root = deepest_common_ancestor([go_id3, go_id4], godag)
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim))
    gosubdag.prt_goids([go_root, go_id3, go_id4])

    # Information content of a single term
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))

    # Resnik: information content of the most informative common ancestor,
    # i.e. the most specific common parent term in the GO
    # (original comment quotes 4.0540784252 for this pair).
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_r))

    # Lin (original comment quotes -0.607721957763 for this pair)
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_l))
def test_semantic_similarity(usr_assc=None):
    """Build term counts for each association file and print count summaries.

    Arguments:
      usr_assc -- optional single association file name; when given, only
                  that file is processed instead of ``ASSOCIATIONS`` minus the
                  two ``goa_uniprot_all*`` files.

    Association files are read from the current working directory. Names for
    which ``dnld_assc`` returns nothing are gathered and passed to
    ``_prt_not_found`` after the loop.
    """
    # Source of the association files: http://current.geneontology.org/annotations/
    skip = {'goa_uniprot_all.gaf', 'goa_uniprot_all_noiea.gaf'}
    associations = sorted(ASSOCIATIONS.difference(skip))
    go2obj = get_go2obj()
    if usr_assc is not None:
        associations = [usr_assc]

    base_dir = os.getcwd()
    unread = set()
    for assc_file in associations:
        t_start = timeit.default_timer()
        fin_assc = os.path.join(base_dir, assc_file)
        id2gos = dnld_assc(fin_assc, go2obj, prt=sys.stdout)
        if id2gos:
            # Counts of each GO term, most frequent first
            tcnts = TermCounts(go2obj, id2gos)
            by_count = tcnts.gocnts.most_common()
            if by_count:
                print("{ASSC}".format(ASSC=assc_file))
                print(sorted(tcnts.aspect_counts.most_common()))
                highest = by_count[0][1]
                # No threshold, then half and a tenth of the largest count
                prt_info(tcnts, by_count, None)
                prt_info(tcnts, by_count, highest / 2.0)
                prt_info(tcnts, by_count, highest / 10.0)
                print("{HMS} {hms} {ASSC}\n".format(
                    ASSC=assc_file, HMS=_hms(TIC), hms=_hms(t_start)))
        else:
            unread.add(assc_file)

    print('{HMS} {N} Associations'.format(HMS=_hms(TIC), N=len(associations)))
    if unread:
        _prt_not_found(unread)
def test_semantic_similarity(usr_assc=None):
    """Build term counts for a fixed list of association files and print summaries.

    Arguments:
      usr_assc -- optional single association file name; when given it
                  replaces the built-in list.

    Every file is looked up in the current working directory. For a file
    whose term counts are non-empty, the file name, the sorted aspect counts
    and three ``prt_info`` reports are printed.
    """
    go2obj = get_go2obj()
    # Files left out of the list on purpose (kept from the original source):
    #   gene_association.PAMGO_Mgrisea  -- TBD Resolve DB_Name containing '|'
    #   gene_association.gramene_oryza  -- DB_Name
    #   goa_uniprot_all.gaf
    #   goa_uniprot_all_noiea.gaf
    associations = [
        'gene_association.GeneDB_Lmajor',
        'gene_association.GeneDB_Pfalciparum',
        'gene_association.GeneDB_Tbrucei',
        'gene_association.GeneDB_tsetse',
        'gene_association.PAMGO_Atumefaciens',
        'gene_association.PAMGO_Ddadantii',
        'gene_association.PAMGO_Oomycetes',
        'gene_association.aspgd',
        'gene_association.cgd',
        'gene_association.dictyBase',
        'gene_association.ecocyc',
        'gene_association.fb',
        'gene_association.gonuts',
        'gene_association.jcvi',
        'gene_association.mgi',
        'gene_association.pombase',
        'gene_association.pseudocap',
        'gene_association.reactome',
        'gene_association.rgd',
        'gene_association.sgd',
        'gene_association.sgn',
        'gene_association.tair',
        'gene_association.wb',
        'gene_association.zfin',
        'goa_chicken.gaf',
        'goa_chicken_complex.gaf',
        'goa_chicken_isoform.gaf',
        'goa_chicken_rna.gaf',
        'goa_cow.gaf',
        'goa_cow_complex.gaf',
        'goa_cow_isoform.gaf',
        'goa_cow_rna.gaf',
        'goa_dog.gaf',
        'goa_dog_complex.gaf',
        'goa_dog_isoform.gaf',
        'goa_dog_rna.gaf',
        'goa_human.gaf',
        'goa_human_complex.gaf',
        'goa_human_isoform.gaf',
        'goa_human_rna.gaf',
        'goa_pdb.gaf',
        'goa_pig.gaf',
        'goa_pig_complex.gaf',
        'goa_pig_isoform.gaf',
        'goa_pig_rna.gaf',
    ]
    if usr_assc is not None:
        associations = [usr_assc]

    workdir = os.getcwd()
    for fname in associations:
        gene2gos = dnld_assc(os.path.join(workdir, fname), go2obj, prt=sys.stdout)
        # Counts of each GO term, most frequent first
        termcounts = TermCounts(go2obj, gene2gos)
        ranked = termcounts.gocnts.most_common()
        if not ranked:
            continue
        print("\n{ASSC}".format(ASSC=fname))
        print(sorted(termcounts.aspect_counts.most_common()))
        top_count = ranked[0][1]
        # No threshold, then half and a tenth of the largest count
        for threshold in (None, top_count / 2.0, top_count / 10.0):
            prt_info(termcounts, ranked, threshold)
mean = [] num = [] df = pd.read_csv( '/sf/smpdata1/pronozinau/Blast_test/odb10v0_gene_xrefs_onlyGO.tab', sep='\t', header=None) df.columns = ['ort', 'GO', '3'] zipbO = zip(df['ort'].to_list(), df['GO'].to_list()) my_dict = defaultdict(list) for k, v in zipbO: my_dict[k].append(v) #my_dict = read_associations('/sf/smpdata1/pronozinau/GO_slim/GO_slim.csv', 'id2gos') godag = GODag("go-basic.obo") fin_gaf = os.path.join(os.getcwd(), "tair.gaf") associations = dnld_assc(fin_gaf, godag) termcounts = TermCounts(godag, associations) godag = GODag("go-basic.obo") #def find_csv_filenames( path_to_dir, suffix=".csv" ): # filenames = listdir(path_to_dir) # return [ filename for filename in filenames if filename.endswith( suffix ) ] #blat = find_csv_filenames("/storage/pronozinau/ALL_base_OtrhoDB/metout/group3_bla/", "csv") blat = pd.read_csv('/storage/pronozinau/OrthoDB/mono_sp.csv', sep=',') for w in blat['0']: try: clustal = pd.read_csv('/storage/pronozinau/OrthoDB/clustalw/group_3/' + w + '.csv', sep='\t', header=None) blast = pd.read_csv(