def test_semantic_similarity(usr_assc=None):
    """Build TermCounts for each association file and print count summaries.

    For every association file, the GAF is read, GO term counts are
    initialized, and prt_info is called three times with different third
    arguments. Association files yielding no gene-to-GO mapping are collected
    and reported at the end through _prt_not_found.

    Args:
        usr_assc: Optional association file name, relative to REPO. When it
            is not None, only that file is processed instead of the default
            list derived from ASSOCIATIONS.
    """
    # These two files are left out of the default run
    # (presumably because of their size -- TODO confirm).
    not_these = {'goa_uniprot_all.gaf', 'goa_uniprot_all_noiea.gaf'}
    associations = sorted(ASSOCIATIONS.difference(not_these))
    go2obj = get_go2obj()
    # Annotation files: http://current.geneontology.org/annotations/
    if usr_assc is not None:
        associations = [usr_assc]

    not_found = set()
    for assc_name in associations:
        tic = timeit.default_timer()

        # Fetch the annotation file if it is not already present locally.
        fin_gaf = os.path.join(REPO, assc_name)
        if not os.path.exists(fin_gaf):
            dnld_annotation(fin_gaf)

        # 'all' requests the associations without restricting to one namespace.
        assc_gene2gos = GafReader(fin_gaf).get_id2gos('all')
        if not assc_gene2gos:
            not_found.add(assc_name)
            continue

        # Initialize the counts of each GO term.
        tcntobj = TermCounts(go2obj, assc_gene2gos)
        go_cnt = tcntobj.gocnts.most_common()

        if go_cnt:
            print("{ASSC}".format(ASSC=assc_name))
            print(tcntobj.aspect_counts)
            # Count of the first entry; the largest count if gocnts is a
            # collections.Counter -- TODO confirm.
            gocnt_max = go_cnt[0][1]
            # Third argument looks like a count cutoff (none, 1/2 and 1/10 of
            # the first count) -- verify against prt_info.
            prt_info(tcntobj, go_cnt, None)
            prt_info(tcntobj, go_cnt, gocnt_max / 2.0)
            prt_info(tcntobj, go_cnt, gocnt_max / 10.0)
        # Timing line: formatted from module-level TIC and this file's tic.
        print("{HMS} {hms} {ASSC}\n".format(ASSC=assc_name, HMS=_hms(TIC), hms=_hms(tic)))

    print('{HMS} {N} Associations'.format(HMS=_hms(TIC), N=len(associations)))
    if not_found:
        _prt_not_found(not_found)
def test_i148b_semsim_lin(do_plt=False):
    """Test for issue 148, Lin Similarity if a term has no annotations.

    Resnik and Lin similarities are computed for every pair of GO IDs
    produced by combo_w_rplc(goids, 2), printed with _prt_values, and the
    Lin values are then verified by _chk_lin.

    Args:
        do_plt: When true, _do_plt(tcntobj, godag) is called before the
            similarities are computed.
    """
    fin_gaf = os.path.join(REPO, 'tests/data/yangRWC/fig2a_nonleaf0.gaf')
    godag = GODag(os.path.join(REPO, "tests/data/yangRWC/fig2a.obo"))
    annoobj = GafReader(fin_gaf, godag=godag)

    associations = annoobj.get_id2gos('CC')
    tcntobj = TermCounts(godag, associations)

    if do_plt:
        _do_plt(tcntobj, godag)

    goids = list(godag.keys())

    # Calculate Resnik values.
    # Pairs are keyed by frozenset so (a, b) and (b, a) share one entry.
    p2r = {
        frozenset([a, b]): resnik_sim(a, b, godag, tcntobj)
        for a, b in combo_w_rplc(goids, 2)
    }
    _prt_values('Resnik', goids, p2r)

    # Calculate Lin values
    p2l = {
        frozenset([a, b]): lin_sim(a, b, godag, tcntobj)
        for a, b in combo_w_rplc(goids, 2)
    }
    _prt_values('Lin', goids, p2l)
    _chk_lin(p2l)