def test_semantic_similarity():
    """Exercise the semantic-similarity helpers on the TAIR annotations.

    Prints the semantic similarity, information content, Resnik score and Lin
    score for one pair of BP terms whose deepest common ancestor is asserted
    to be ``NS2GO['BP']``, then scores ``NS2GO['CC']`` against itself.
    """
    obo_path = os.path.join(REPO, "go-basic.obo")
    godag = get_godag(obo_path, loading_bar=None)

    # Arabidopsis annotations; they feed the term counts used further down.
    gaf_path = os.path.join(REPO, 'tair.gaf')
    associations = dnld_assc(gaf_path, godag)

    # The pair of terms compared throughout this test.
    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    pair = {'GO1': go_id3, 'GO2': go_id4}

    # Semantic similarity is computed from the DAG alone.
    sim = semantic_similarity(go_id3, go_id4, godag)
    msg_sim = '\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'
    print(msg_sim.format(VAL=sim, **pair))
    for goid in (go_id3, go_id4):
        print(godag[goid])

    # Information content of a single term needs the per-term counts first.
    termcounts = TermCounts(godag, associations)
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    msg_info = '\nInformation content ({GO}) = {INFO}\n'
    print(msg_info.format(GO=go_id, INFO=infocontent))
    assert infocontent, "FATAL INFORMATION CONTENT"

    # Resnik: information content of the most informative common ancestor.
    # The score is expected to be 0.0 because that ancestor is the BP top term.
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    dca = deepest_common_ancestor([go_id3, go_id4], godag)
    assert dca == NS2GO['BP']
    assert sim_r == get_info_content(dca, termcounts)
    assert sim_r == 0.0
    msg_resnik = 'Resnik similarity score ({GO1}, {GO2}) = {VAL}'
    print(msg_resnik.format(VAL=sim_r, **pair))

    # Lin: expected to be 0.0 as well, the terms only meet at the BP top term.
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    msg_lin = 'Lin similarity score ({GO1}, {GO2}) = {VAL}'
    print(msg_lin.format(VAL=sim_l, **pair))
    assert sim_l == 0.0, "FATAL LIN SCORE"

    # A namespace top term compared with itself: Resnik 0.0, Lin 1.0.
    go_top_cc = NS2GO['CC']
    sim_r = resnik_sim(go_top_cc, go_top_cc, godag, termcounts)
    assert sim_r == 0.0
    sim_l = lin_sim(go_top_cc, go_top_cc, godag, termcounts)
    assert sim_l == 1.0
def prt_info(termcounts, go_cnt, max_val):
    """Print count, information content and name for the GO ID picked by ``get_goid``.

    Args:
        termcounts: Object passed to ``get_info_content``; its ``go2obj``
            mapping supplies the printed term name.
        go_cnt: Forwarded unchanged to ``get_goid``.
        max_val: Forwarded unchanged to ``get_goid``.
    """
    template = 'Information content ({GO} {CNT:7,}) = {INFO:8.6f} {NAME}'
    picked = get_goid(go_cnt, max_val)
    go_id, cnt = picked
    fields = {
        'GO': go_id,
        'CNT': cnt,
        'INFO': get_info_content(go_id, termcounts),
        'NAME': termcounts.go2obj[go_id].name,
    }
    print(template.format(**fields))
def resnik_sim_hdf5(go_id1, go_id2, godag, termcounts, hdf5):
    """Compute Resnik's similarity measure using the HDF5-backed ancestor lookup.

    Args:
        go_id1: First GO term; converted with ``goterm2id`` before the lookup.
        go_id2: Second GO term; converted with ``goterm2id`` before the lookup.
        godag: Forwarded to ``deepest_common_ancestor_hdf5``.
        termcounts: Forwarded to ``semantic.get_info_content``.
        hdf5: Forwarded to ``deepest_common_ancestor_hdf5``.

    Returns:
        The information content of the term returned by
        ``deepest_common_ancestor_hdf5``, or ``-1`` if the lookup or the
        scoring raises an ordinary exception.
    """
    try:
        msca_goid = deepest_common_ancestor_hdf5(
            [goterm2id(go_id1), goterm2id(go_id2)], godag, hdf5)
        return semantic.get_info_content(msca_goid, termcounts)
    except Exception:
        # Best-effort sentinel for any lookup/scoring failure. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt and SystemExit propagate.
        return -1
def test_semantic_similarity():
    """Print information content and pairwise Resnik/Lin scores for a fixed GO ID set.

    The report is produced once per (species, association file) pair, reading
    all input files from the current working directory. Every pairwise Resnik
    and Lin score is asserted to be truthy.
    """
    goids = [
        "GO:0140101",
        "GO:0140097",
        "GO:0140096",
        "GO:0140098",
        "GO:0015318",
        "GO:0140110",
    ]
    # (species label, association file name) pairs to report on.
    associations = [
        ('human', 'goa_human.gaf'),
        ('yeast', 'gene_association.sgd'),
    ]
    cwd = os.getcwd()  # all input files are resolved against this directory
    godag = get_godag(os.path.join(cwd, "go-basic.obo"), loading_bar=None)

    ordered_goids = sorted(goids)
    fmt_info = '{SPECIES} Information content {INFO:8.6f} {GO} {NAME}'
    fmt_pair = '{GO1} {GO2} {RESNIK:6.4f} {LIN:7.4f}'

    for species, assc_name in associations:
        print()
        # Annotations for this species, then the per-term counts built on them.
        assc_gene2gos = dnld_assc(os.path.join(cwd, assc_name), godag, prt=None)
        termcounts = TermCounts(godag, assc_gene2gos)

        # One information-content line per GO term.
        for goid in ordered_goids:
            print(fmt_info.format(
                SPECIES=species,
                GO=goid,
                INFO=get_info_content(goid, termcounts),
                NAME=godag[goid].name))

        # Table of similarities for every unordered pair of GO terms.
        print("GO #1 GO #2 Resnik Lin")
        print("---------- ---------- ------ -------")
        for go_a, go_b in itertools.combinations(ordered_goids, 2):
            # Resnik: information content of the most informative common ancestor.
            sim_r = resnik_sim(go_a, go_b, godag, termcounts)
            sim_l = lin_sim(go_a, go_b, godag, termcounts)
            print(fmt_pair.format(GO1=go_a, GO2=go_b, RESNIK=sim_r, LIN=sim_l))
            assert sim_r, "FATAL RESNIK SCORE"
            assert sim_l, "FATAL LIN SCORE"
def resnik_sim_pandas(tup, df, termcounts):
    """Compute Resnik's similarity measure from a pandas ancestor table.

    Args:
        tup: Pair ``(go_id1, go_id2)`` of GO IDs to compare.
        df: DataFrame indexed by GO ID. The code reads a ``parents`` column
            (a list-like of GO IDs per row) and sorts on a ``depth`` column.
            ``df`` is not modified.
        termcounts: Forwarded to ``semantic.get_info_content``.

    Returns:
        The information content of ``go_id1`` when both IDs are equal.
        Otherwise the information content of the largest-``depth`` ID that is
        common to the ``parents`` entries of every term listed in either
        input's ``parents``. ``-1`` when either ID is missing from
        ``df.index``.

    Raises:
        IndexError: If the combined ``parents`` list is empty or no ID is
            common to all of the inspected ``parents`` entries.
    """
    go_id1, go_id2 = tup
    if go_id1 == go_id2:
        return semantic.get_info_content(go_id1, termcounts)
    if go_id2 not in df.index or go_id1 not in df.index:
        return -1

    # Build a NEW list. Using ``+=`` on the cell value would extend the list
    # object stored inside ``df`` in place and corrupt the table for later calls.
    ancestors = list(df.loc[str(go_id2)].parents) + list(df.loc[str(go_id1)].parents)

    # ``parents`` entries of every collected ancestor, intersected together.
    terms = df.loc[ancestors]
    ancestors_set = terms.parents.tolist()
    intersection = set(ancestors_set[0]).intersection(*ancestors_set[1:])

    # Deepest common term first.
    common_ancestors = df.loc[list(intersection)].sort_values('depth', ascending=False)
    msca_goid = common_ancestors.index.tolist()[0]
    return semantic.get_info_content(msca_goid, termcounts)
def test_semantic_similarity():
    """Print and sanity-check similarity measures for one pair of BP terms.

    Input files are read from the current working directory. Information
    content, the Resnik score and the Lin score are each asserted to be truthy.
    """
    obo_path = os.path.join(os.getcwd(), "go-basic.obo")
    godag = get_godag(obo_path, loading_bar=None)

    # Arabidopsis annotations; used for the term counts below.
    gaf_path = os.path.join(os.getcwd(), 'gene_association.tair')
    associations = dnld_assc(gaf_path, godag)

    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    pair = {'GO1': go_id3, 'GO2': go_id4}

    # Semantic similarity between the two terms, computed from the DAG.
    sim = semantic_similarity(go_id3, go_id4, godag)
    msg_sim = '\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'
    print(msg_sim.format(VAL=sim, **pair))
    for goid in (go_id3, go_id4):
        print(godag[goid])

    # Information content of a single term requires the counts of each GO term.
    termcounts = TermCounts(godag, associations)
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    msg_info = '\nInformation content ({GO}) = {INFO}\n'
    print(msg_info.format(GO=go_id, INFO=infocontent))
    assert infocontent, "FATAL INFORMATION CONTENT"

    # Resnik: information content of the most informative common ancestor,
    # i.e. the most specific common parent term in the GO.
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    msg_resnik = 'Resnik similarity score ({GO1}, {GO2}) = {VAL}'
    print(msg_resnik.format(VAL=sim_r, **pair))
    assert sim_r, "FATAL RESNIK SCORE"

    # Lin similarity for the same pair.
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    msg_lin = 'Lin similarity score ({GO1}, {GO2}) = {VAL}'
    print(msg_lin.format(VAL=sim_l, **pair))
    assert sim_l, "FATAL LIN SCORE"
def test_semantic_similarity():
    """Report information content and pairwise Resnik/Lin scores per species.

    Annotations are loaded with ``namespace='MF'`` from files under ``REPO``.
    Every pairwise Resnik and Lin score is asserted to be truthy.
    """
    goids = [
        "GO:0140101",
        "GO:0140097",
        "GO:0140096",
        "GO:0140098",
        "GO:0015318",
        "GO:0140110",
    ]
    # (species label, association file name) pairs to report on.
    associations = [
        ('human', 'goa_human.gaf'),
        ('yeast', 'sgd.gaf'),
    ]
    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)

    goids_sorted = sorted(goids)
    row_info = '{SPECIES} Information content {INFO:8.6f} {NS} {GO} {NAME}'
    row_pair = '{GO1} {GO2} {RESNIK:6.4f} {LIN:7.4f}'

    for species, assc_name in associations:
        print()
        # Annotations for the current species and the term counts built on them.
        fin_assc = os.path.join(REPO, assc_name)
        assc_gene2gos = dnld_assc(fin_assc, godag, namespace='MF', prt=None)
        termcounts = TermCounts(godag, assc_gene2gos)

        # Information content of each GO term, with its namespace and name.
        for goid in goids_sorted:
            infocontent = get_info_content(goid, termcounts)
            term = godag[goid]
            print(row_info.format(
                SPECIES=species,
                GO=goid,
                INFO=infocontent,
                NS=term.namespace,
                NAME=term.name))

        # Similarities for each unordered pair of GO terms.
        print("GO #1 GO #2 Resnik Lin")
        print("---------- ---------- ------ -------")
        for go_a, go_b in itertools.combinations(goids_sorted, 2):
            # Resnik: information content of the most informative common ancestor.
            sim_r = resnik_sim(go_a, go_b, godag, termcounts)
            sim_l = lin_sim(go_a, go_b, godag, termcounts)
            print(row_pair.format(GO1=go_a, GO2=go_b, RESNIK=sim_r, LIN=sim_l))
            assert sim_r, "FATAL RESNIK SCORE"
            assert sim_l, "FATAL LIN SCORE"
def test_semantic_i88():
    """Print similarity measures for one pair of BP terms, plus a GoSubDag report.

    No assertions are made: the test passes when every call completes.
    """
    godag = obo_parser.GODag("go-basic.obo")
    # Every key of the DAG is handed to GoSubDag.
    # NOTE(review): an earlier assignment, overwritten on the very next
    # statement, kept only keys equal to the term's own id. That dead store
    # was removed; confirm which set is actually intended.
    goids = set(godag.keys())

    # Arabidopsis annotations (dnld_assc includes read_gaf).
    fin_gaf = os.path.join(REPO, "tair.gaf")
    associations = dnld_assc(fin_gaf, godag, prt=None)

    # Counts for each GO term, shared by the sub-DAG and the scores below.
    termcounts = TermCounts(godag, associations)
    gosubdag = GoSubDag(goids, godag, tcntobj=termcounts)

    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process

    # Semantic similarity, then a printout of the pair and its common ancestor.
    go_root = deepest_common_ancestor([go_id3, go_id4], godag)
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim))
    gosubdag.prt_goids([go_root, go_id3, go_id4])

    # Information content of a single term.
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))

    # Resnik: information content of the most informative common ancestor.
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_r))

    # Lin similarity for the same pair.
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_l))
def test_semantic_similarity():
    """Print and sanity-check similarity measures for one pair of BP terms.

    Input files are read from ``REPO``. Information content, the Resnik score
    and the Lin score are each asserted to be truthy.
    """
    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)

    # Arabidopsis annotations; used for the term counts below.
    associations = dnld_assc(os.path.join(REPO, 'tair.gaf'), godag)

    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    term_pair = (go_id3, go_id4)

    def _show(template, value):
        """Print ``template`` filled with the term pair and ``value``."""
        print(template.format(GO1=go_id3, GO2=go_id4, VAL=value))

    # Semantic similarity between the two terms, computed from the DAG.
    sim = semantic_similarity(go_id3, go_id4, godag)
    _show('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.', sim)
    for goid in term_pair:
        print(godag[goid])

    # Information content of a single term requires the counts of each GO term.
    termcounts = TermCounts(godag, associations)
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))
    assert infocontent, "FATAL INFORMATION CONTENT"

    # Resnik: information content of the most informative common ancestor,
    # i.e. the most specific common parent term in the GO.
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    _show('Resnik similarity score ({GO1}, {GO2}) = {VAL}', sim_r)
    assert sim_r, "FATAL RESNIK SCORE"

    # Lin similarity for the same pair.
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    _show('Lin similarity score ({GO1}, {GO2}) = {VAL}', sim_l)
    assert sim_l, "FATAL LIN SCORE"
def test_semantic_i88():
    """Print similarity measures for one pair of BP terms, plus a GoSubDag report.

    The test makes no assertions; it succeeds when all calls run to completion.
    """
    godag = obo_parser.GODag("go-basic.obo")
    # All keys of the DAG go into the sub-DAG.
    # NOTE(review): the removed dead store that preceded this line selected
    # only keys equal to the term's own id and was overwritten immediately;
    # confirm which selection is wanted.
    goids = set(godag.keys())

    # Arabidopsis annotations (dnld_assc includes read_gaf).
    fin_gaf = os.path.join(REPO, "tair.gaf")
    associations = dnld_assc(fin_gaf, godag, prt=None)

    # Term counts back both the sub-DAG and the information-content scores.
    termcounts = TermCounts(godag, associations)
    gosubdag = GoSubDag(goids, godag, tcntobj=termcounts)

    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    pair = {'GO1': go_id3, 'GO2': go_id4}

    # Semantic similarity, then the pair printed together with its common ancestor.
    go_root = deepest_common_ancestor([go_id3, go_id4], godag)
    sim = semantic_similarity(go_id3, go_id4, godag)
    msg_sim = '\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'
    print(msg_sim.format(VAL=sim, **pair))
    gosubdag.prt_goids([go_root, go_id3, go_id4])

    # Information content of a single term.
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))

    # Resnik: information content of the most informative common ancestor.
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=sim_r, **pair))

    # Lin similarity for the same pair.
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=sim_l, **pair))
def test_semantic_similarity():
    """Print similarity measures for one pair of BP terms using remote TAIR annotations.

    The ontology is read from ``go-basic.obo`` and the annotations from the
    geneontology.org URL below. No assertions are made.
    """
    godag = obo_parser.GODag("go-basic.obo")

    # Arabidopsis annotations, read from the given location.
    gaf_url = "http://geneontology.org/gene-associations/gene_association.tair.gz"
    associations = read_gaf(gaf_url)

    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    pair = {'GO1': go_id3, 'GO2': go_id4}

    # Semantic similarity between the two terms, computed from the DAG.
    sim = semantic_similarity(go_id3, go_id4, godag)
    msg_sim = '\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'
    print(msg_sim.format(VAL=sim, **pair))
    for goid in (go_id3, go_id4):
        print(godag[goid])

    # Information content of a single term requires the counts of each GO term.
    termcounts = TermCounts(godag, associations)
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    msg_info = '\nInformation content ({GO}) = {INFO}\n'
    print(msg_info.format(GO=go_id, INFO=infocontent))

    # Resnik: information content of the most informative common ancestor,
    # i.e. the most specific common parent term in the GO.
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    msg_resnik = 'Resnik similarity score ({GO1}, {GO2}) = {VAL}'
    print(msg_resnik.format(VAL=sim_r, **pair))

    # Lin similarity for the same pair.
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    msg_lin = 'Lin similarity score ({GO1}, {GO2}) = {VAL}'
    print(msg_lin.format(VAL=sim_l, **pair))
def prt_info(termcounts, go_cnt, max_val):
    """Print one line describing the GO ID that ``get_goid`` selects.

    The line shows the GO ID, its count, its information content and the term
    name looked up in ``termcounts.go2obj``.

    Args:
        termcounts: Passed to ``get_info_content`` and used for the name lookup.
        go_cnt: Passed through to ``get_goid``.
        max_val: Passed through to ``get_goid``.
    """
    go_id, cnt = get_goid(go_cnt, max_val)
    info = get_info_content(go_id, termcounts)
    name = termcounts.go2obj[go_id].name
    line = 'Information content ({GO} {CNT:7,}) = {INFO:8.6f} {NAME}'.format(
        GO=go_id, CNT=cnt, INFO=info, NAME=name)
    print(line)