Esempio n. 1
0
def test_update_association():
    """Verify that all count-propagation code paths give the same associations.

    Three propagated versions of the same annotations are produced -- by the
    ``GODag.update_association`` method, by the stand-alone
    ``update_association`` function and by
    ``get_id2gos(..., propagate_counts=True)`` on the annotation object --
    and the latter two are checked against the first with ``_chk_assc``.
    Finally the GO DAG itself is checked with ``_chk_godag``.
    """
    print('\n1) READ GODAG:')
    # Alternative annotation files noted by the original author:
    #   gene_association.fb gene_association.mgi
    fin_assc = os.path.join(REPO, "goa_human.gaf")
    fin_obo = os.path.join(REPO, "go-basic.obo")
    tic = timeit.default_timer()
    godag = get_godag(fin_obo)
    tic = prt_hms(tic, "Created two GODags: One for original and one for new propagate counts")

    print('\n2) READ ANNOTATIONS:')
    assc_orig = dnld_assc(fin_assc, godag)
    tic = prt_hms(tic, "Associations Read")
    objanno = get_objanno(fin_assc, 'gaf', godag=godag)
    tic = prt_hms(tic, "Associations Read")

    print('\n3) MAKE COPIES OF ASSOCIATIONS:')
    # Each propagation call below receives its own copy, with fresh GO sets
    assc1 = dict((gene, set(goids)) for gene, goids in assc_orig.items())
    assc2 = dict((gene, set(goids)) for gene, goids in assc_orig.items())
    tic = prt_hms(tic, "Associations Copied: One for original and one for new")

    print('\n4) UPDATE ASSOCIATIONS (PROPAGATE COUNTS):')
    godag.update_association(assc1)
    tic = prt_hms(tic, "ORIG: godag.update_association(assc)")
    update_association(assc2, godag)
    tic = prt_hms(tic, "NEW SA:    update_association(go2obj, assc_orig)")
    assc3 = objanno.get_id2gos(namespace='BP', propagate_counts=True)
    tic = prt_hms(tic, "NEW BASE:  update_association(go2obj, assc_orig)")

    print('\n5) RUN CHECKS')
    for assc_new in (assc2, assc3):
        _chk_assc(assc1, assc_new)
    _chk_godag(godag, fin_obo)
Esempio n. 2
0
 def __init__(self):
     """Read the human associations and record the gene-count extremes per GO.

     Stores the gene-to-GOs mapping returned by ``dnld_assc``, its inverse
     (GO-to-genes) from ``get_b2aset``, and the smallest and largest number
     of genes found for any single GO term.
     """
     self.prt = sys.stdout
     self.gene2gos_orig = dnld_assc(
         os.path.join(REPO, "goa_human.gaf"), go2obj=None, prt=self.prt)
     self.go2genes_orig = get_b2aset(self.gene2gos_orig)
     gene_cnts = list(map(len, self.go2genes_orig.values()))
     self.min_genes = min(gene_cnts)
     self.max_genes = max(gene_cnts)
     # Inverting the inverted mapping must reproduce the original mapping
     assert self.gene2gos_orig == get_b2aset(self.go2genes_orig)
Esempio n. 3
0
def test_semantic_similarity():
    """Print information content and Resnik/Lin similarity for selected GO terms.

    For each annotation file (human and yeast, read from the current working
    directory) the information content of every GO ID in ``goids`` is
    printed, followed by the Resnik and Lin scores of every pair of those
    IDs.  Each pairwise score must be truthy (neither ``None`` nor zero).
    """
    goids = [
        "GO:0140101",
        "GO:0140097",
        "GO:0140096",
        "GO:0140098",
        "GO:0015318",
        "GO:0140110",
    ]
    # (species label, annotation file name) pairs to run the checks against
    associations = [
        ('human', 'goa_human.gaf'),
        ('yeast', 'gene_association.sgd'),
    ]

    cwd = os.getcwd()  # current working directory
    godag = get_godag(os.path.join(cwd, "go-basic.obo"), loading_bar=None)
    goids_sorted = sorted(goids)  # loop-invariant, so sort only once
    for species, assc_name in associations:  # Limit test numbers for speed
        print()
        # Get all the annotations for the current species
        assc_gene2gos = dnld_assc(os.path.join(cwd, assc_name),
                                  godag,
                                  prt=None)
        # Count GO term usage; needed for the information-content based scores
        termcounts = TermCounts(godag, assc_gene2gos)

        # Print information values for each GO term
        for goid in goids_sorted:
            infocontent = get_info_content(goid, termcounts)
            print(
                '{SPECIES} Information content {INFO:8.6f} {GO} {NAME}'.format(
                    SPECIES=species,
                    GO=goid,
                    INFO=infocontent,
                    NAME=godag[goid].name))

        # Print semantic similarities between each pair of GO terms
        print("GO #1      GO #2      Resnik Lin")
        print("---------- ---------- ------ -------")
        for go_a, go_b in itertools.combinations(goids_sorted, 2):
            # Resnik's similarity measure is defined as the information content of the most
            # informative common ancestor. That is, the most specific common parent-term in the GO.
            sim_r = resnik_sim(go_a, go_b, godag, termcounts)
            sim_l = lin_sim(go_a, go_b, godag, termcounts)
            # Report a missing score with the descriptive message instead of
            # the TypeError that a float format spec raises when given None.
            assert sim_r is not None, "FATAL RESNIK SCORE"
            assert sim_l is not None, "FATAL LIN SCORE"
            print('{GO1} {GO2} {RESNIK:6.4f} {LIN:7.4f}'.format(GO1=go_a,
                                                                GO2=go_b,
                                                                RESNIK=sim_r,
                                                                LIN=sim_l))
            # A zero score is printed first, then rejected
            assert sim_r, "FATAL RESNIK SCORE"
            assert sim_l, "FATAL LIN SCORE"
def test_semantic_similarity():
    """Exercise the semantic-similarity helpers on two BP terms and the CC root.

    Prints the semantic similarity, information content, Resnik score and
    Lin score for GO:0048364 / GO:0044707 using the tair.gaf annotations,
    asserting that their deepest common ancestor is the BP root and that the
    Resnik and Lin scores are therefore 0.0.  The CC root compared with
    itself must give Resnik 0.0 and Lin 1.0.
    """
    fin_obo = os.path.join(REPO, "go-basic.obo")
    godag = get_godag(fin_obo, loading_bar=None)
    # Arabidopsis annotations
    fin_assc = os.path.join(REPO, 'tair.gaf')
    associations = dnld_assc(fin_assc, godag)

    term_a = 'GO:0048364'  # BP level-03 depth-04 root development
    term_b = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    pair = {'GO1': term_a, 'GO2': term_b}

    # Semantic similarity needs only the DAG
    sim = semantic_similarity(term_a, term_b, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        VAL=sim, **pair))
    for goid in (term_a, term_b):
        print(godag[goid])

    # Information content needs the per-term annotation counts
    termcounts = TermCounts(godag, associations)
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(INFO=infocontent, GO=go_id))
    assert infocontent, "FATAL INFORMATION CONTENT"

    # Resnik: information content of the deepest common ancestor, which for
    # this pair is the BP root, so the score is 0.0
    sim_r = resnik_sim(term_a, term_b, godag, termcounts)
    dca = deepest_common_ancestor([term_a, term_b], godag)
    assert dca == NS2GO['BP']
    assert sim_r == get_info_content(dca, termcounts)
    assert sim_r == 0.0
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=sim_r, **pair))

    # Lin: also 0.0, since the terms are related only through the BP root
    sim_l = lin_sim(term_a, term_b, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=sim_l, **pair))
    assert sim_l == 0.0, "FATAL LIN SCORE"

    # A namespace root compared with itself
    go_top_cc = NS2GO['CC']
    assert resnik_sim(go_top_cc, go_top_cc, godag, termcounts) == 0.0
    assert lin_sim(go_top_cc, go_top_cc, godag, termcounts) == 1.0
Esempio n. 5
0
def test_semantic_similarity():
    """Print and sanity-check similarity measures for GO:0048364 / GO:0044707.

    The GO DAG and the ``gene_association.tair`` annotations are read from
    the current working directory.  The semantic similarity, information
    content, Resnik score and Lin score are printed; the last three must be
    truthy.
    """
    cwd = os.getcwd()
    godag = get_godag(os.path.join(cwd, "go-basic.obo"), loading_bar=None)
    # Arabidopsis annotations
    fin_assc = os.path.join(cwd, 'gene_association.tair')
    associations = dnld_assc(fin_assc, godag)

    term_a = 'GO:0048364'  # BP level-03 depth-04 root development
    term_b = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    pair = {'GO1': term_a, 'GO2': term_b}

    # Semantic similarity needs only the DAG
    sim = semantic_similarity(term_a, term_b, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        VAL=sim, **pair))
    for goid in (term_a, term_b):
        print(godag[goid])

    # Information content needs the per-term annotation counts
    termcounts = TermCounts(godag, associations)
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(INFO=infocontent, GO=go_id))
    assert infocontent, "FATAL INFORMATION CONTENT"

    # Resnik: information content of the most informative common ancestor,
    # i.e. the most specific common parent term
    sim_r = resnik_sim(term_a, term_b, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=sim_r, **pair))
    assert sim_r, "FATAL RESNIK SCORE"

    # Lin similarity
    sim_l = lin_sim(term_a, term_b, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=sim_l, **pair))
    assert sim_l, "FATAL LIN SCORE"
def test_semantic_similarity():
    """Print information content and Resnik/Lin similarity for selected GO terms.

    For each annotation file (human and yeast, read from ``REPO`` with
    ``namespace='MF'``) the information content of every GO ID in ``goids``
    is printed, followed by the Resnik and Lin scores of every pair of those
    IDs.  Each pairwise score must be truthy (neither ``None`` nor zero).
    """
    goids = [
        "GO:0140101",
        "GO:0140097",
        "GO:0140096",
        "GO:0140098",
        "GO:0015318",
        "GO:0140110",
    ]
    # (species label, annotation file name) pairs to run the checks against
    associations = [
        ('human', 'goa_human.gaf'),
        ('yeast', 'sgd.gaf'),
    ]

    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
    goids_sorted = sorted(goids)  # loop-invariant, so sort only once
    for species, assc_name in associations:  # Limit test numbers for speed
        print()
        # Get all the annotations for the current species
        fin_assc = os.path.join(REPO, assc_name)
        assc_gene2gos = dnld_assc(fin_assc, godag, namespace='MF', prt=None)
        # Count GO term usage; needed for the information-content based scores
        termcounts = TermCounts(godag, assc_gene2gos)

        # Print information values for each GO term
        for goid in goids_sorted:
            infocontent = get_info_content(goid, termcounts)
            term = godag[goid]
            print('{SPECIES} Information content {INFO:8.6f} {NS} {GO} {NAME}'.format(
                SPECIES=species, GO=goid, INFO=infocontent, NS=term.namespace, NAME=term.name))

        # Print semantic similarities between each pair of GO terms
        print("GO #1      GO #2      Resnik Lin")
        print("---------- ---------- ------ -------")
        for go_a, go_b in itertools.combinations(goids_sorted, 2):
            # Resnik's similarity measure is defined as the information content of the most
            # informative common ancestor. That is, the most specific common parent-term in the GO.
            sim_r = resnik_sim(go_a, go_b, godag, termcounts)
            sim_l = lin_sim(go_a, go_b, godag, termcounts)
            # Report a missing score with the descriptive message instead of
            # the TypeError that a float format spec raises when given None.
            assert sim_r is not None, "FATAL RESNIK SCORE"
            assert sim_l is not None, "FATAL LIN SCORE"
            print('{GO1} {GO2} {RESNIK:6.4f} {LIN:7.4f}'.format(
                GO1=go_a, GO2=go_b, RESNIK=sim_r, LIN=sim_l))
            # A zero score is printed first, then rejected
            assert sim_r, "FATAL RESNIK SCORE"
            assert sim_l, "FATAL LIN SCORE"
Esempio n. 7
0
def describe_assc(org, fin_assc, go2obj, obj, prt):
    """Print count statistics for one organism's association file.

    Reads the gene-to-GOs associations from ``fin_assc``, inverts them to
    GO-to-genes, and passes two lists of counts to ``obj.prt_data``: the
    number of GO terms per gene and the number of genes per GO term.  Each
    row is labelled with ``org``.
    """
    # Sample rows kept from the original notes:
    #
    # Assc.       | # Assc| range      | 25th | median | 75th | mean | stddev
    # ------------|-------|------------|------|--------|------|------|-------
    # hsa GO/gene | 19394 | 1 to   212 |    5 |      9 |   17 |   13 |     14
    # hsa gene/GO | 17277 | 1 to 8,897 |    1 |      3 |    8 |   15 |    120
    #
    # mus GO/gene | 19870 | 1 to   261 |    5 |     10 |   18 |   14 |     15
    # mus gene/GO | 17491 | 1 to 7,009 |    1 |      3 |    8 |   16 |    129
    #
    # dme GO/gene | 12551 | 1 to   137 |    2 |      4 |    8 |    6 |      7
    # dme gene/GO |  7878 | 1 to 1,675 |    1 |      3 |    7 |   10 |     41
    gene2gos = dnld_assc(fin_assc, go2obj, prt=None)  # Associations
    go2genes = get_b2aset(gene2gos)
    rows = (
        ("{ORG} GO/gene", list(map(len, gene2gos.values()))),
        ("{ORG} gene/GO", list(map(len, go2genes.values()))),
    )
    for label, counts in rows:
        obj.prt_data(label.format(ORG=org), counts, prt)
Esempio n. 8
0
def test_semantic_i88():
    """Print similarity measures for GO:0048364 / GO:0044707 using a GoSubDag.

    Loads the GO DAG and the tair.gaf annotations, builds term counts and a
    ``GoSubDag`` over every key in the DAG, then prints the semantic
    similarity, the deepest common ancestor together with both terms, the
    information content of GO:0048364, and the Resnik and Lin scores.
    Nothing is asserted; the test passes if no call raises.
    """
    godag = obo_parser.GODag("go-basic.obo")
    # Every key of the DAG is used.  (A previous assignment restricted to
    # keys equal to the term's own id was immediately overwritten by this
    # one, so it has been removed as a dead store.)
    goids = set(godag.keys())
    # Get all the annotations from arabidopsis.
    fin_gaf = os.path.join(REPO, "tair.gaf")
    # dnld_assc includes read_gaf
    associations = dnld_assc(fin_gaf, godag, prt=None)

    # First get the counts and information content for each GO term.
    termcounts = TermCounts(godag, associations)
    gosubdag = GoSubDag(goids, godag, tcntobj=termcounts)

    # Now we can calculate the semantic distance and semantic similarity
    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    go_root = deepest_common_ancestor([go_id3, go_id4], godag)
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.
          format(GO1=go_id3, GO2=go_id4, VAL=sim))
    gosubdag.prt_goids([go_root, go_id3, go_id4])

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id,
                                                           INFO=infocontent))

    # Resnik's similarity measure is defined as the information content of the most
    # informative common ancestor. That is, the most specific common parent-term in
    # the GO.
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3,
                                                                  GO2=go_id4,
                                                                  VAL=sim_r))

    # Lin similarity score
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3,
                                                               GO2=go_id4,
                                                               VAL=sim_l))
Esempio n. 9
0
def test_semantic_similarity(usr_assc=None):
    """Print GO-term count summaries for many association files.

    By default every name in ``ASSOCIATIONS`` except the two uniprot-all
    files is processed, in sorted order; passing ``usr_assc`` restricts the
    run to that single file.  Files are looked up in the current working
    directory.  A file whose associations come back empty is recorded and
    reported at the end through ``_prt_not_found``.
    """
    skipped = {'goa_uniprot_all.gaf', 'goa_uniprot_all_noiea.gaf'}
    associations = sorted(ASSOCIATIONS.difference(skipped))
    go2obj = get_go2obj()
    # Source of the annotation files:
    # http://current.geneontology.org/annotations/
    if usr_assc is not None:
        associations = [usr_assc]
    cwd = os.getcwd()
    not_found = set()
    for assc_name in associations:  # Limit test numbers for speed
        tic = timeit.default_timer()
        fin_assc = os.path.join(cwd, assc_name)
        assc_gene2gos = dnld_assc(fin_assc, go2obj, prt=sys.stdout)
        if not assc_gene2gos:
            not_found.add(assc_name)
            continue

        # Count the GO terms, most frequent first
        termcounts = TermCounts(go2obj, assc_gene2gos)
        go_cnt = termcounts.gocnts.most_common()

        if go_cnt:
            print("{ASSC}".format(ASSC=assc_name))
            print(sorted(termcounts.aspect_counts.most_common()))
            gocnt_max = go_cnt[0][1]
            # Three reports: no value, then half and a tenth of the top count
            for val in (None, gocnt_max / 2.0, gocnt_max / 10.0):
                prt_info(termcounts, go_cnt, val)
        # Elapsed time overall (TIC) and for this file (tic)
        print("{HMS} {hms} {ASSC}\n".format(
            HMS=_hms(TIC), hms=_hms(tic), ASSC=assc_name))
    print('{HMS} {N} Associations'.format(HMS=_hms(TIC), N=len(associations)))
    if not_found:
        _prt_not_found(not_found)
def test_semantic_similarity():
    """Print and sanity-check similarity measures for GO:0048364 / GO:0044707.

    The GO DAG and the tair.gaf annotations are read from ``REPO``.  The
    semantic similarity, information content, Resnik score and Lin score are
    printed; the last three must be truthy.
    """
    fin_obo = os.path.join(REPO, "go-basic.obo")
    godag = get_godag(fin_obo, loading_bar=None)
    # Arabidopsis annotations
    fin_assc = os.path.join(REPO, 'tair.gaf')
    associations = dnld_assc(fin_assc, godag)

    term_a = 'GO:0048364'  # BP level-03 depth-04 root development
    term_b = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    pair = {'GO1': term_a, 'GO2': term_b}

    # Semantic similarity needs only the DAG
    sim = semantic_similarity(term_a, term_b, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        VAL=sim, **pair))
    for goid in (term_a, term_b):
        print(godag[goid])

    # Information content needs the per-term annotation counts
    termcounts = TermCounts(godag, associations)
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(INFO=infocontent, GO=go_id))
    assert infocontent, "FATAL INFORMATION CONTENT"

    # Resnik: information content of the most informative common ancestor,
    # i.e. the most specific common parent term
    sim_r = resnik_sim(term_a, term_b, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=sim_r, **pair))
    assert sim_r, "FATAL RESNIK SCORE"

    # Lin similarity
    sim_l = lin_sim(term_a, term_b, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=sim_l, **pair))
    assert sim_l, "FATAL LIN SCORE"
Esempio n. 11
0
def describe_assc(org, fin_assc, go2obj, obj, prt):
    """Print count statistics for one organism's association file.

    Reads the gene-to-GOs associations from ``fin_assc``, inverts them to
    GO-to-genes, asserts that both mappings are non-empty, and passes two
    lists of counts to ``obj.prt_data``: the number of GO terms per gene and
    the number of genes per GO term.  Each row is labelled with ``org``.
    """
    # Sample rows kept from the original notes:
    #
    # Assc.       | # Assc| range      | 25th | median | 75th | mean | stddev
    # ------------|-------|------------|------|--------|------|------|-------
    # hsa GO/gene | 19394 | 1 to   212 |    5 |      9 |   17 |   13 |     14
    # hsa gene/GO | 17277 | 1 to 8,897 |    1 |      3 |    8 |   15 |    120
    #
    # mus GO/gene | 19870 | 1 to   261 |    5 |     10 |   18 |   14 |     15
    # mus gene/GO | 17491 | 1 to 7,009 |    1 |      3 |    8 |   16 |    129
    #
    # dme GO/gene | 12551 | 1 to   137 |    2 |      4 |    8 |    6 |      7
    # dme gene/GO |  7878 | 1 to 1,675 |    1 |      3 |    7 |   10 |     41
    gene2gos = dnld_assc(fin_assc, go2obj, prt=None)  # Associations
    go2genes = get_b2aset(gene2gos)
    # Both directions of the mapping must contain data
    assert gene2gos
    assert go2genes
    rows = (
        ("{ORG} GO/gene", list(map(len, gene2gos.values()))),
        ("{ORG} gene/GO", list(map(len, go2genes.values()))),
    )
    for label, counts in rows:
        obj.prt_data(label.format(ORG=org), counts, prt)
def _test_path_bp_mf(branch_dist, godag, prt):
    """Check the measures between a molecular_function and a biological_process term.

    Without ``branch_dist`` the semantic distance, semantic similarity,
    Resnik score and Lin score between the two terms must all be ``None``.
    With ``branch_dist`` the semantic distance must equal the sum of both
    term depths plus ``branch_dist``.  Results are written to ``prt`` unless
    it is ``None``.
    """
    go_mf = 'GO:0003676'  # level-03 depth-03 nucleic acid binding [molecular_function]
    go_bp = 'GO:0007516'  # level-04 depth-05 hemocyte development [biological_process]
    fmt = '({GO1}, {GO2}) {TYPE:6} score = {VAL}\n'

    def _report(label, val):
        """Write one formatted result line to prt, if a stream was given."""
        if prt is not None:
            prt.write(fmt.format(TYPE=label, GO1=go_mf, GO2=go_bp, VAL=val))

    # All four measures are computed first, then reported, then checked
    dst_none = semantic_distance(go_mf, go_bp, godag)
    sim_none = semantic_similarity(go_mf, go_bp, godag)
    termcounts = TermCounts(godag, dnld_assc("gene_association.tair", godag))
    sim_r = resnik_sim(go_mf, go_bp, godag, termcounts)
    sim_l = lin_sim(go_mf, go_bp, godag, termcounts)

    _report('semantic distance', dst_none)
    _report('semantic similarity', sim_none)
    _report('Resnik similarity', sim_r)
    _report('Lin similarity', sim_l)
    for val in (dst_none, sim_none, sim_r, sim_l):
        assert val is None

    # Supplying a branch distance makes the cross-branch distance defined
    sim_d = semantic_distance(go_mf, go_bp, godag, branch_dist)
    _report('semantic distance', sim_d)
    assert sim_d == godag[go_mf].depth + godag[go_bp].depth + branch_dist
Esempio n. 13
0
def test_semantic_i88():
    """Print similarity measures for GO:0048364 / GO:0044707 using a GoSubDag.

    Loads the GO DAG and the tair.gaf annotations, builds term counts and a
    ``GoSubDag`` over every key in the DAG, then prints the semantic
    similarity, the deepest common ancestor together with both terms, the
    information content of GO:0048364, and the Resnik and Lin scores.
    Nothing is asserted; the test passes if no call raises.
    """
    godag = obo_parser.GODag("go-basic.obo")
    # Every key of the DAG is used.  (A previous assignment restricted to
    # keys equal to the term's own id was immediately overwritten by this
    # one, so it has been removed as a dead store.)
    goids = set(godag.keys())
    # Get all the annotations from arabidopsis.
    fin_gaf = os.path.join(REPO, "tair.gaf")
    # dnld_assc includes read_gaf
    associations = dnld_assc(fin_gaf, godag, prt=None)

    # First get the counts and information content for each GO term.
    termcounts = TermCounts(godag, associations)
    gosubdag = GoSubDag(goids, godag, tcntobj=termcounts)

    # Now we can calculate the semantic distance and semantic similarity
    go_id3 = 'GO:0048364' # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707' # BP level-02 depth-02 single-multicellular organism process
    go_root = deepest_common_ancestor([go_id3, go_id4], godag)
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim))
    gosubdag.prt_goids([go_root, go_id3, go_id4])

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))

    # Resnik's similarity measure is defined as the information content of the most
    # informative common ancestor. That is, the most specific common parent-term in
    # the GO.
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_r))

    # Lin similarity score
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_l))
Esempio n. 14
0
def test_semantic_similarity(usr_assc=None):
    """Print GO-term count summaries for many association files.

    By default every name in ``ASSOCIATIONS`` except the two uniprot-all
    files is processed, in sorted order; passing ``usr_assc`` restricts the
    run to that single file.  Files are looked up in the current working
    directory.  A file whose associations come back empty is recorded and
    reported at the end through ``_prt_not_found``.
    """
    skipped = {'goa_uniprot_all.gaf', 'goa_uniprot_all_noiea.gaf'}
    associations = sorted(ASSOCIATIONS.difference(skipped))
    go2obj = get_go2obj()
    # Source of the annotation files:
    # http://current.geneontology.org/annotations/
    if usr_assc is not None:
        associations = [usr_assc]
    cwd = os.getcwd()
    not_found = set()
    for assc_name in associations:  # Limit test numbers for speed
        tic = timeit.default_timer()
        fin_assc = os.path.join(cwd, assc_name)
        assc_gene2gos = dnld_assc(fin_assc, go2obj, prt=sys.stdout)
        if not assc_gene2gos:
            not_found.add(assc_name)
            continue

        # Count the GO terms, most frequent first
        termcounts = TermCounts(go2obj, assc_gene2gos)
        go_cnt = termcounts.gocnts.most_common()

        if go_cnt:
            print("{ASSC}".format(ASSC=assc_name))
            print(sorted(termcounts.aspect_counts.most_common()))
            gocnt_max = go_cnt[0][1]
            # Three reports: no value, then half and a tenth of the top count
            for val in (None, gocnt_max/2.0, gocnt_max/10.0):
                prt_info(termcounts, go_cnt, val)
        # Elapsed time overall (TIC) and for this file (tic)
        print("{HMS} {hms} {ASSC}\n".format(
            HMS=_hms(TIC), hms=_hms(tic), ASSC=assc_name))
    print('{HMS} {N} Associations'.format(HMS=_hms(TIC), N=len(associations)))
    if not_found:
        _prt_not_found(not_found)
Esempio n. 15
0
def test_semantic_similarity(usr_assc=None):
    """Print GO-term count summaries for a fixed list of association files.

    Each file is read from the current working directory.  When the term
    counts are non-empty, the file name, the sorted aspect counts and three
    ``prt_info`` reports are printed.  Passing ``usr_assc`` restricts the
    run to that single file.
    """
    go2obj = get_go2obj()
    associations = [
        'gene_association.GeneDB_Lmajor',
        'gene_association.GeneDB_Pfalciparum',
        'gene_association.GeneDB_Tbrucei',
        'gene_association.GeneDB_tsetse',
        'gene_association.PAMGO_Atumefaciens',
        'gene_association.PAMGO_Ddadantii',
        #'gene_association.PAMGO_Mgrisea', # TBD Resolve DB_Name containing '|'
        'gene_association.PAMGO_Oomycetes',
        'gene_association.aspgd',
        'gene_association.cgd',
        'gene_association.dictyBase',
        'gene_association.ecocyc',
        'gene_association.fb',
        'gene_association.gonuts',
        #'gene_association.gramene_oryza', # DB_Name
        'gene_association.jcvi',
        'gene_association.mgi',
        'gene_association.pombase',
        'gene_association.pseudocap',
        'gene_association.reactome',
        'gene_association.rgd',
        'gene_association.sgd',
        'gene_association.sgn',
        'gene_association.tair',
        'gene_association.wb',
        'gene_association.zfin',
        'goa_chicken.gaf',
        'goa_chicken_complex.gaf',
        'goa_chicken_isoform.gaf',
        'goa_chicken_rna.gaf',
        'goa_cow.gaf',
        'goa_cow_complex.gaf',
        'goa_cow_isoform.gaf',
        'goa_cow_rna.gaf',
        'goa_dog.gaf',
        'goa_dog_complex.gaf',
        'goa_dog_isoform.gaf',
        'goa_dog_rna.gaf',
        'goa_human.gaf',
        'goa_human_complex.gaf',
        'goa_human_isoform.gaf',
        'goa_human_rna.gaf',
        'goa_pdb.gaf',
        'goa_pig.gaf',
        'goa_pig_complex.gaf',
        'goa_pig_isoform.gaf',
        'goa_pig_rna.gaf',
        #'goa_uniprot_all.gaf',
        #'goa_uniprot_all_noiea.gaf',
    ]
    if usr_assc is not None:
        associations = [usr_assc]
    cwd = os.getcwd()
    for assc_name in associations:  # Limit test numbers for speed
        fin_assc = os.path.join(cwd, assc_name)
        assc_gene2gos = dnld_assc(fin_assc, go2obj, prt=sys.stdout)

        # Count the GO terms, most frequent first
        termcounts = TermCounts(go2obj, assc_gene2gos)
        go_cnt = termcounts.gocnts.most_common()
        if not go_cnt:
            # Nothing counted for this file, so there is nothing to report
            continue

        print("\n{ASSC}".format(ASSC=assc_name))
        print(sorted(termcounts.aspect_counts.most_common()))
        gocnt_max = go_cnt[0][1]
        # Three reports: no value, then half and a tenth of the top count
        for val in (None, gocnt_max / 2.0, gocnt_max / 10.0):
            prt_info(termcounts, go_cnt, val)
Esempio n. 16
0
# Script setup: load the group-to-GO table, the GO DAG, the tair.gaf term
# counts and the list of groups to process.

# Empty accumulators; not filled within this part of the script
# (presumably appended to further down -- TODO confirm).
mean = []
num = []
# Tab-separated table without a header row; its three columns are named
# 'ort', 'GO' and '3' below.
df = pd.read_csv(
    '/sf/smpdata1/pronozinau/Blast_test/odb10v0_gene_xrefs_onlyGO.tab',
    sep='\t',
    header=None)
df.columns = ['ort', 'GO', '3']
# Group the 'GO' values by their 'ort' value, keeping row order:
# my_dict[ort] -> list of GO entries
zipbO = zip(df['ort'].to_list(), df['GO'].to_list())
my_dict = defaultdict(list)
for k, v in zipbO:
    my_dict[k].append(v)
#my_dict = read_associations('/sf/smpdata1/pronozinau/GO_slim/GO_slim.csv', 'id2gos')

# GO DAG, annotations from tair.gaf in the current working directory, and
# the term counts built from the two.
godag = GODag("go-basic.obo")
fin_gaf = os.path.join(os.getcwd(), "tair.gaf")
associations = dnld_assc(fin_gaf, godag)
termcounts = TermCounts(godag, associations)
# NOTE(review): go-basic.obo is parsed a second time here, replacing the
# instance that termcounts was built from -- confirm whether this reload is
# needed.
godag = GODag("go-basic.obo")

#def find_csv_filenames( path_to_dir, suffix=".csv" ):
#    filenames = listdir(path_to_dir)
#    return [ filename for filename in filenames if filename.endswith( suffix ) ]
#blat = find_csv_filenames("/storage/pronozinau/ALL_base_OtrhoDB/metout/group3_bla/", "csv")
# Comma-separated table; its column '0' is iterated by the loop that follows.
blat = pd.read_csv('/storage/pronozinau/OrthoDB/mono_sp.csv', sep=',')
for w in blat['0']:
    try:
        clustal = pd.read_csv('/storage/pronozinau/OrthoDB/clustalw/group_3/' +
                              w + '.csv',
                              sep='\t',
                              header=None)
        blast = pd.read_csv(