def test_semantic_similarity():
    """Check semantic, Resnik and Lin similarity for one pair of BP terms.

    The ontology and the TAIR associations are read from REPO. Every score is
    printed, and the Resnik/Lin scores are compared with the values expected
    for a pair whose deepest common ancestor is the BP root. The CC root is
    finally compared with itself.
    """
    fin_obo = os.path.join(REPO, "go-basic.obo")
    godag = get_godag(fin_obo, loading_bar=None)
    # Annotations are loaded from the TAIR GAF file.
    fin_gaf = os.path.join(REPO, 'tair.gaf')
    associations = dnld_assc(fin_gaf, godag)

    # The two terms compared throughout this test.
    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    pair = {'GO1': go_id3, 'GO2': go_id4}

    # Semantic similarity is computed from the DAG alone.
    sim = semantic_similarity(go_id3, go_id4, godag)
    msg_sim = '\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'
    print(msg_sim.format(VAL=sim, **pair))
    for goid in (go_id3, go_id4):
        print(godag[goid])

    # Information content needs the per-term annotation counts.
    termcounts = TermCounts(godag, associations)
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    msg_info = '\nInformation content ({GO}) = {INFO}\n'
    print(msg_info.format(GO=go_id, INFO=infocontent))
    assert infocontent, "FATAL INFORMATION CONTENT"

    # Resnik similarity is the information content of the deepest common
    # ancestor. For this pair that ancestor is the BP root, so the score is 0.0.
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    dca = deepest_common_ancestor([go_id3, go_id4], godag)
    assert dca == NS2GO['BP']
    assert sim_r == get_info_content(dca, termcounts)
    assert sim_r == 0.0
    msg_resnik = 'Resnik similarity score ({GO1}, {GO2}) = {VAL}'
    print(msg_resnik.format(VAL=sim_r, **pair))

    # Lin similarity is also 0.0 here: the terms only meet at the BP root.
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    msg_lin = 'Lin similarity score ({GO1}, {GO2}) = {VAL}'
    print(msg_lin.format(VAL=sim_l, **pair))
    assert sim_l == 0.0, "FATAL LIN SCORE"

    # A namespace root compared with itself: Resnik is 0.0 and Lin is 1.0.
    go_top_cc = NS2GO['CC']
    assert resnik_sim(go_top_cc, go_top_cc, godag, termcounts) == 0.0
    assert lin_sim(go_top_cc, go_top_cc, godag, termcounts) == 1.0
def prt_info(termcounts, go_cnt, max_val):
    """Print one line with the information content of a GO ID chosen by get_goid.

    ``go_cnt`` and ``max_val`` are handed to ``get_goid``, which returns the
    GO ID together with its count. The term name is looked up in
    ``termcounts.go2obj``.
    """
    go_id, cnt = get_goid(go_cnt, max_val)
    # Collect the fields in the same order the original evaluated them.
    fields = {
        'GO': go_id,
        'CNT': cnt,
        'INFO': get_info_content(go_id, termcounts),
        'NAME': termcounts.go2obj[go_id].name,
    }
    print('Information content ({GO} {CNT:7,}) = {INFO:8.6f} {NAME}'.format(**fields))
Esempio n. 3
0
def resnik_sim_hdf5(go_id1, go_id2, godag, termcounts, hdf5):
    '''
        Computes Resnik's similarity measure.

        Both GO ids are converted with ``goterm2id`` and passed, together with
        ``godag`` and ``hdf5``, to ``deepest_common_ancestor_hdf5``. The
        information content of the ancestor it returns is the score.

        Returns the score, or -1 when any step raises an ordinary exception.
        ``KeyboardInterrupt`` and ``SystemExit`` are deliberately NOT turned
        into -1, so a long batch run can still be interrupted.
    '''
    try:
        msca_goid = deepest_common_ancestor_hdf5(
            [goterm2id(go_id1), goterm2id(go_id2)], godag, hdf5)
        return semantic.get_info_content(msca_goid, termcounts)
    except Exception:
        # Best-effort scoring: callers receive the -1 sentinel instead of an error.
        return -1
Esempio n. 4
0
def test_semantic_similarity():
    """Print information content and pairwise Resnik/Lin scores for a few GO terms.

    The ontology and one association file per species are read from the
    current working directory. For each species the information content of
    every GO id is printed, followed by a table of Resnik and Lin scores for
    every pair of ids.
    """
    goids = [
        "GO:0140101",
        "GO:0140097",
        "GO:0140096",
        "GO:0140098",
        "GO:0015318",
        "GO:0140110",
    ]
    # (species label, association file name) pairs to run against.
    associations = [
        ('human', 'goa_human.gaf'),
        ('yeast', 'gene_association.sgd'),
    ]

    cwd = os.getcwd()  # all data files are expected in the current working directory
    godag = get_godag(os.path.join(cwd, "go-basic.obo"), loading_bar=None)
    goids_sorted = sorted(goids)  # same order for every species; sort once
    for species, assc_name in associations:  # Limit test numbers for speed
        print()
        # Get all the annotations for the current species
        assc_gene2gos = dnld_assc(os.path.join(cwd, assc_name), godag, prt=None)
        # Per-term annotation counts, needed for information content
        termcounts = TermCounts(godag, assc_gene2gos)

        # Print information values for each GO term
        for goid in goids_sorted:
            infocontent = get_info_content(goid, termcounts)
            print(
                '{SPECIES} Information content {INFO:8.6f} {GO} {NAME}'.format(
                    SPECIES=species,
                    GO=goid,
                    INFO=infocontent,
                    NAME=godag[goid].name))

        # Print semantic similarities between each pair of GO terms
        print("GO #1      GO #2      Resnik Lin")
        print("---------- ---------- ------ -------")
        for go_a, go_b in itertools.combinations(goids_sorted, 2):
            # Resnik's similarity measure is defined as the information content of the most
            # informative common ancestor. That is, the most specific common parent-term in the GO.
            sim_r = resnik_sim(go_a, go_b, godag, termcounts)
            sim_l = lin_sim(go_a, go_b, godag, termcounts)
            # Check before formatting: a missing score would otherwise fail
            # inside the ':6.4f' format with an unrelated TypeError.
            # A score of 0.0 is valid (terms that only meet at a namespace
            # root), so test for absence rather than truthiness.
            # NOTE(review): assumes a failed computation yields None - confirm
            # against resnik_sim/lin_sim.
            assert sim_r is not None, "FATAL RESNIK SCORE"
            assert sim_l is not None, "FATAL LIN SCORE"
            print('{GO1} {GO2} {RESNIK:6.4f} {LIN:7.4f}'.format(GO1=go_a,
                                                                GO2=go_b,
                                                                RESNIK=sim_r,
                                                                LIN=sim_l))
Esempio n. 5
0
def resnik_sim_pandas(tup, df, termcounts):
    '''
        Computes Resnik's similarity measure from a GO-term table.

        Parameters:
            tup: pair ``(go_id1, go_id2)`` of GO ids.
            df: DataFrame indexed by GO id. Its ``parents`` column (a sequence
                of GO ids per row) and its ``depth`` column are read.
                The frame is never modified.
            termcounts: passed through to ``semantic.get_info_content``.

        Returns:
            The information content of the term itself when both ids are equal.
            Otherwise the information content of the deepest common ancestor found.
            -1 when either id is missing from ``df.index`` or no common
            ancestor exists.
    '''
    go_id1, go_id2 = tup
    if go_id1 == go_id2:
        return semantic.get_info_content(go_id1, termcounts)

    if go_id2 not in df.index or go_id1 not in df.index:
        return -1

    # Build a NEW list. Using `+=` on the list stored in the frame would
    # append go_id1's parents to go_id2's row on every call and corrupt `df`.
    # NOTE(review): assumes each `parents` cell is a list-like of GO ids.
    ancestors = list(df.loc[str(go_id2)].parents) + list(df.loc[str(go_id1)].parents)
    if not ancestors:
        # Neither term has parents, so there is nothing to intersect.
        return -1

    # NOTE(review): the intersection is taken over the parent lists of the
    # collected ancestors, not over the two terms' own parent lists. This is
    # preserved as written - confirm it matches the intended table layout.
    parent_lists = df.loc[ancestors].parents.tolist()
    intersection = set(parent_lists[0]).intersection(*parent_lists[1:])
    if not intersection:
        # No shared ancestor: return the same sentinel as "term not found".
        return -1

    common_ancestors = df.loc[list(intersection)].sort_values('depth',
                                                              ascending=False)
    # Deepest common ancestor comes first after the descending sort.
    msca_goid = common_ancestors.index.tolist()[0]
    return semantic.get_info_content(msca_goid, termcounts)
Esempio n. 6
0
def test_semantic_similarity():
    """Compute semantic, Resnik and Lin similarity for one pair of GO terms.

    ``go-basic.obo`` and ``gene_association.tair`` are read from the current
    working directory. Every score is printed and the test asserts that each
    one was produced.
    """
    cwd = os.getcwd()  # both data files are expected here
    godag = get_godag(os.path.join(cwd, "go-basic.obo"), loading_bar=None)
    # Annotations come from the TAIR association file.
    associations = dnld_assc(os.path.join(cwd, 'gene_association.tair'), godag)

    # Semantic similarity of the two terms, computed from the DAG alone.
    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.
          format(GO1=go_id3, GO2=go_id4, VAL=sim))
    print(godag[go_id3])
    print(godag[go_id4])

    # Information content of a single term needs the per-term counts first.
    termcounts = TermCounts(godag, associations)

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id,
                                                           INFO=infocontent))
    assert infocontent, "FATAL INFORMATION CONTENT"

    # Resnik's similarity measure is defined as the information content of the most
    # informative common ancestor. That is, the most specific common parent-term in
    # the GO.
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3,
                                                                  GO2=go_id4,
                                                                  VAL=sim_r))
    # 0.0 is a valid score (the deepest common ancestor may be the namespace
    # root), so check that a score exists rather than that it is truthy.
    # NOTE(review): assumes a failed computation yields None - confirm.
    assert sim_r is not None, "FATAL RESNIK SCORE"

    # Lin similarity for the same pair; 0.0 is equally valid here.
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3,
                                                               GO2=go_id4,
                                                               VAL=sim_l))
    assert sim_l is not None, "FATAL LIN SCORE"
def test_semantic_similarity():
    """Print information content and pairwise Resnik/Lin scores for a few GO terms.

    The ontology and one association file per species are read from REPO,
    with the associations restricted to the MF namespace. For each species
    the information content of every GO id is printed, followed by a table
    of Resnik and Lin scores for every pair of ids.
    """
    goids = [
        "GO:0140101",
        "GO:0140097",
        "GO:0140096",
        "GO:0140098",
        "GO:0015318",
        "GO:0140110",
    ]
    # (species label, association file name) pairs to run against.
    associations = [
        ('human', 'goa_human.gaf'),
        ('yeast', 'sgd.gaf'),
    ]

    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
    goids_sorted = sorted(goids)  # same order for every species; sort once
    for species, assc_name in associations:  # Limit test numbers for speed
        print()
        # Get all the annotations for the current species
        fin_assc = os.path.join(REPO, assc_name)
        assc_gene2gos = dnld_assc(fin_assc, godag, namespace='MF', prt=None)
        # Per-term annotation counts, needed for information content
        termcounts = TermCounts(godag, assc_gene2gos)

        # Print information values for each GO term
        for goid in goids_sorted:
            infocontent = get_info_content(goid, termcounts)
            term = godag[goid]
            print('{SPECIES} Information content {INFO:8.6f} {NS} {GO} {NAME}'.format(
                SPECIES=species, GO=goid, INFO=infocontent, NS=term.namespace, NAME=term.name))

        # Print semantic similarities between each pair of GO terms
        print("GO #1      GO #2      Resnik Lin")
        print("---------- ---------- ------ -------")
        for go_a, go_b in itertools.combinations(goids_sorted, 2):
            # Resnik's similarity measure is defined as the information content of the most
            # informative common ancestor. That is, the most specific common parent-term in the GO.
            sim_r = resnik_sim(go_a, go_b, godag, termcounts)
            sim_l = lin_sim(go_a, go_b, godag, termcounts)
            # Check before formatting: a missing score would otherwise fail
            # inside the ':6.4f' format with an unrelated TypeError.
            # A score of 0.0 is valid (terms that only meet at a namespace
            # root), so test for absence rather than truthiness.
            # NOTE(review): assumes a failed computation yields None - confirm
            # against resnik_sim/lin_sim.
            assert sim_r is not None, "FATAL RESNIK SCORE"
            assert sim_l is not None, "FATAL LIN SCORE"
            print('{GO1} {GO2} {RESNIK:6.4f} {LIN:7.4f}'.format(
                GO1=go_a, GO2=go_b, RESNIK=sim_r, LIN=sim_l))
Esempio n. 8
0
def test_semantic_i88():
    """Print semantic, Resnik and Lin similarity for one pair of GO terms.

    The ontology is loaded with ``obo_parser.GODag`` and the TAIR associations
    are read from REPO. A ``GoSubDag`` built over every key of the DAG is used
    to print the two terms and their deepest common ancestor. The test has no
    assertions; it only has to run to completion.
    """
    godag = obo_parser.GODag("go-basic.obo")
    # Every key of the DAG is used. The original first built a set limited to
    # keys equal to their term's own id and then overwrote it with this one;
    # the dead assignment is dropped.
    # NOTE(review): confirm that including the remaining keys is intended.
    goids = set(godag.keys())
    # TAIR annotations
    fin_gaf = os.path.join(REPO, "tair.gaf")
    # dnld_assc includes read_gaf
    associations = dnld_assc(fin_gaf, godag, prt=None)

    # First get the counts and information content for each GO term.
    termcounts = TermCounts(godag, associations)
    gosubdag = GoSubDag(goids, godag, tcntobj=termcounts)

    # Semantic similarity of the two terms, computed from the DAG alone.
    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    go_root = deepest_common_ancestor([go_id3, go_id4], godag)
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.
          format(GO1=go_id3, GO2=go_id4, VAL=sim))
    gosubdag.prt_goids([go_root, go_id3, go_id4])

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id,
                                                           INFO=infocontent))

    # Resnik's similarity measure is defined as the information content of the most
    # informative common ancestor. That is, the most specific common parent-term in
    # the GO.
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3,
                                                                  GO2=go_id4,
                                                                  VAL=sim_r))

    # Lin similarity for the same pair.
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3,
                                                               GO2=go_id4,
                                                               VAL=sim_l))
def test_semantic_similarity():
    """Compute semantic, Resnik and Lin similarity for one pair of GO terms.

    ``go-basic.obo`` and ``tair.gaf`` are read from REPO. Every score is
    printed and the test asserts that each one was produced.
    """
    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
    # Annotations come from the TAIR GAF file.
    associations = dnld_assc(os.path.join(REPO, 'tair.gaf'), godag)

    # Semantic similarity of the two terms, computed from the DAG alone.
    go_id3 = 'GO:0048364' # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707' # BP level-02 depth-02 single-multicellular organism process
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim))
    print(godag[go_id3])
    print(godag[go_id4])

    # Information content of a single term needs the per-term counts first.
    termcounts = TermCounts(godag, associations)

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))
    assert infocontent, "FATAL INFORMATION CONTENT"

    # Resnik's similarity measure is defined as the information content of the most
    # informative common ancestor. That is, the most specific common parent-term in
    # the GO.
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_r))
    # 0.0 is a valid score (the deepest common ancestor may be the namespace
    # root), so check that a score exists rather than that it is truthy.
    # NOTE(review): assumes a failed computation yields None - confirm.
    assert sim_r is not None, "FATAL RESNIK SCORE"

    # Lin similarity for the same pair; 0.0 is equally valid here.
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3, GO2=go_id4, VAL=sim_l))
    assert sim_l is not None, "FATAL LIN SCORE"
Esempio n. 10
0
def test_semantic_i88():
    """Print semantic, Resnik and Lin similarity for one pair of GO terms.

    The ontology is loaded with ``obo_parser.GODag`` and the TAIR associations
    are read from REPO. A ``GoSubDag`` built over every key of the DAG is used
    to print the two terms and their deepest common ancestor. The test has no
    assertions; it only has to run to completion.
    """
    godag = obo_parser.GODag("go-basic.obo")
    # Every key of the DAG is used. The original first built a set limited to
    # keys equal to their term's own id and then overwrote it with this one;
    # the dead assignment is dropped.
    # NOTE(review): confirm that including the remaining keys is intended.
    goids = set(godag.keys())
    # TAIR annotations
    fin_gaf = os.path.join(REPO, "tair.gaf")
    # dnld_assc includes read_gaf
    associations = dnld_assc(fin_gaf, godag, prt=None)

    # First get the counts and information content for each GO term.
    termcounts = TermCounts(godag, associations)
    gosubdag = GoSubDag(goids, godag, tcntobj=termcounts)

    # Semantic similarity of the two terms, computed from the DAG alone.
    go_id3 = 'GO:0048364' # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707' # BP level-02 depth-02 single-multicellular organism process
    go_root = deepest_common_ancestor([go_id3, go_id4], godag)
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim))
    gosubdag.prt_goids([go_root, go_id3, go_id4])

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))

    # Resnik's similarity measure is defined as the information content of the most
    # informative common ancestor. That is, the most specific common parent-term in
    # the GO.
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_r))

    # Lin similarity for the same pair.
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_l))
def test_semantic_similarity():
    """Print semantic, Resnik and Lin similarity for one pair of GO terms.

    The ontology is loaded with ``obo_parser.GODag`` and the associations are
    read with ``read_gaf`` from the TAIR gene-association URL. The test has no
    assertions; it only has to run to completion.
    """
    godag = obo_parser.GODag("go-basic.obo")
    # Annotations are read straight from the TAIR gene-association URL.
    url_gaf = "http://geneontology.org/gene-associations/gene_association.tair.gz"
    associations = read_gaf(url_gaf)

    # The two terms compared throughout this test.
    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    pair = {'GO1': go_id3, 'GO2': go_id4}

    # Semantic similarity is computed from the DAG alone.
    sim = semantic_similarity(go_id3, go_id4, godag)
    msg_sim = '\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'
    print(msg_sim.format(VAL=sim, **pair))
    for goid in (go_id3, go_id4):
        print(godag[goid])

    # Information content needs the per-term annotation counts.
    termcounts = TermCounts(godag, associations)
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    msg_info = '\nInformation content ({GO}) = {INFO}\n'
    print(msg_info.format(GO=go_id, INFO=infocontent))

    # Resnik similarity: information content of the most informative
    # (most specific) common ancestor of the two terms.
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    msg_resnik = 'Resnik similarity score ({GO1}, {GO2}) = {VAL}'
    print(msg_resnik.format(VAL=sim_r, **pair))

    # Lin similarity for the same pair.
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    msg_lin = 'Lin similarity score ({GO1}, {GO2}) = {VAL}'
    print(msg_lin.format(VAL=sim_l, **pair))
Esempio n. 12
0
def prt_info(termcounts, go_cnt, max_val):
    """Print one line with the information content of a GO ID chosen by get_goid.

    ``get_goid(go_cnt, max_val)`` supplies the GO ID and its count. The term
    name is taken from ``termcounts.go2obj``.
    """
    go_id, cnt = get_goid(go_cnt, max_val)
    infocontent = get_info_content(go_id, termcounts)
    name = termcounts.go2obj[go_id].name
    line = 'Information content ({GO} {CNT:7,}) = {INFO:8.6f} {NAME}'.format(
        GO=go_id,
        CNT=cnt,
        INFO=infocontent,
        NAME=name,
    )
    print(line)