def test_semantic_similarity(usr_assc=None):
    """Print GO term count summaries for each annotation (GAF) file.

    For every association file name, the file is read with GafReader, its
    associations are fetched with get_id2gos('all'), and a TermCounts object
    is built from them using the GO objects returned by get_go2obj().  When
    the term counts are non-empty, the association name and aspect counts are
    printed and prt_info is called three times: with None, with half of the
    largest GO term count, and with one tenth of it.  Elapsed times are
    printed per file and for the whole run.

    Association files that yield no associations are collected and handed to
    _prt_not_found once the loop is done.

    Args:
        usr_assc: Optional name of a single association file, joined onto
            REPO.  When None, every name in ASSOCIATIONS is used except
            'goa_uniprot_all.gaf' and 'goa_uniprot_all_noiea.gaf'.
    """
    if usr_assc is not None:
        associations = [usr_assc]
    else:
        not_these = {'goa_uniprot_all.gaf', 'goa_uniprot_all_noiea.gaf'}
        associations = sorted(ASSOCIATIONS.difference(not_these))
    go2obj = get_go2obj()
    not_found = set()
    for assc_name in associations:
        tic = timeit.default_timer()
        # Make sure the annotation file is present locally before reading it.
        fin_gaf = os.path.join(REPO, assc_name)
        if not os.path.exists(fin_gaf):
            dnld_annotation(fin_gaf)
        annoobj = GafReader(fin_gaf)
        assc_gene2gos = annoobj.get_id2gos('all')
        if not assc_gene2gos:
            # Nothing to count; remember the file and move on.
            not_found.add(assc_name)
            continue

        # Initialize the counts of each GO term, highest count first.
        tcntobj = TermCounts(go2obj, assc_gene2gos)
        go_cnt = tcntobj.gocnts.most_common()

        if go_cnt:
            print("{ASSC}".format(ASSC=assc_name))
            print(tcntobj.aspect_counts)
            # most_common() sorts by descending count, so the first entry
            # holds the largest count.
            gocnt_max = go_cnt[0][1]
            prt_info(tcntobj, go_cnt, None)
            prt_info(tcntobj, go_cnt, gocnt_max / 2.0)
            prt_info(tcntobj, go_cnt, gocnt_max / 10.0)
        # HMS is measured from the module-level TIC, hms from this
        # iteration's start.
        print("{HMS} {hms} {ASSC}\n".format(ASSC=assc_name,
                                            HMS=_hms(TIC),
                                            hms=_hms(tic)))
    print('{HMS} {N} Associations'.format(HMS=_hms(TIC), N=len(associations)))
    if not_found:
        _prt_not_found(not_found)
# Example #2
# 0
def test_i148b_semsim_lin(do_plt=False):
    """Test for issue 148, Lin Similarity if a term has no annotations.

    Loads the GO DAG from tests/data/yangRWC/fig2a.obo and the annotations
    from tests/data/yangRWC/fig2a_nonleaf0.gaf, builds term counts from the
    'CC' associations, then computes Resnik and Lin similarity for every
    pair of GO IDs produced by combo_w_rplc(goids, 2).  Both sets of values
    are printed with _prt_values and the Lin values are passed to _chk_lin.

    Args:
        do_plt: When true, call _do_plt with the term counts and GO DAG
            before computing similarities.
    """
    fin_gaf = os.path.join(REPO, 'tests/data/yangRWC/fig2a_nonleaf0.gaf')
    godag = GODag(os.path.join(REPO, "tests/data/yangRWC/fig2a.obo"))
    annoobj = GafReader(fin_gaf, godag=godag)

    associations = annoobj.get_id2gos('CC')
    tcntobj = TermCounts(godag, associations)

    if do_plt:
        _do_plt(tcntobj, godag)

    goids = list(godag.keys())

    # Calculate Resnik values; pairs are keyed by frozenset so that (a, b)
    # and (b, a) share one entry.
    p2r = {
        frozenset([a, b]): resnik_sim(a, b, godag, tcntobj)
        for a, b in combo_w_rplc(goids, 2)
    }
    _prt_values('Resnik', goids, p2r)

    # Calculate Lin values
    p2l = {
        frozenset([a, b]): lin_sim(a, b, godag, tcntobj)
        for a, b in combo_w_rplc(goids, 2)
    }
    _prt_values('Lin', goids, p2l)
    _chk_lin(p2l)