Esempio n. 1
0
def test_semantic_similarity():
    """Check that TermCounts over all annotations agree with other counts.

    The combined counts are compared against one TermCounts object per
    namespace and against the result of _old_init_count_terms.
    """
    namespaces = ['BP', 'MF', 'CC']
    godag_r0 = get_godag('go-basic.obo')
    annoobj = GpadReader(get_anno_fullname('goa_human.gpad'), godag=godag_r0)
    ns2assoc = annoobj.get_ns2assc()
    assoc = annoobj.get_id2gos('all')

    # One TermCounts object per namespace
    ns2tcnt = {}
    for nspc in namespaces:
        ns2tcnt[nspc] = TermCounts(godag_r0, ns2assoc[nspc])

    # Time the construction of TermCounts from every annotation at once
    start = timeit.default_timer()
    tcntobj = TermCounts(godag_r0, assoc)
    msg_cur = 'CUR ACTUAL   {N:,} TermCounts initialized'.format(
        N=len(tcntobj.gocnts))
    prt_hms(start, msg_cur)

    # Every per-namespace count must equal the count in the combined object
    for nspc in namespaces:
        for goid, cnt in ns2tcnt[nspc].gocnts.items():
            assert tcntobj.gocnts[goid] == cnt

    # Time the old counting routine, then compare it with the current counts
    start = timeit.default_timer()
    gocnts_old = _old_init_count_terms(godag_r0, assoc.values())
    assert gocnts_old
    msg_old = 'OLD EXPECTED {N:,} TermCounts initialized'.format(
        N=len(gocnts_old))
    prt_hms(start, msg_old)
    for goid, cnt_old in gocnts_old.items():
        assert cnt_old == tcntobj.gocnts[goid]
Esempio n. 2
0
def intialize_term_counts():
    """Count annotations per GO term and store the counts as JSON.

    Loads ``go-basic.obo`` from DATA_DIR, reads the 'all' associations from
    UNIPROT_ASSOCIATIONS_FILE_PATH, and writes a mapping of each term's ``id``
    to ``TermCounts.get_count`` of that id into JSON_INDEXED_FILE_PATH.
    """
    obo_path = os.path.join(DATA_DIR, "go-basic.obo")
    go_dag = GODag(obo_path)

    reader = IdToGosReader(UNIPROT_ASSOCIATIONS_FILE_PATH, godag=go_dag)
    term_counts = TermCounts(go_dag, reader.get_id2gos('all'))

    # Keyed by the ``id`` attribute of every value stored in the DAG
    go_freq_dict = {
        term.id: term_counts.get_count(term.id)
        for term in go_dag.values()
    }

    # Persist the frequency mapping
    with open(JSON_INDEXED_FILE_PATH, 'w') as json_file:
        json.dump(go_freq_dict, json_file)
Esempio n. 3
0
def _precompute_term_frequencies():
    """Precompute per-term annotation counts and store them as JSON.

    Loads the ontology from GO_DAG_FILE_PATH, reads the 'all' associations
    from UNIPROT_ASSOCIATIONS_FILE_PATH, and writes a mapping of GO id to
    ``TermCounts.get_count`` into FREQUENCY_COUNTS_FILE_PATH. Each alternate
    id of a term is stored with the same count as the term's own ``id``.
    """
    print("Start precomputations of term frequencies...")
    # GODag's loading output is sent to the null device. The context manager
    # closes that handle; it was previously opened and never closed.
    with open(os.devnull, 'w') as devnull:
        go_dag = GODag(GO_DAG_FILE_PATH, prt=devnull)

    associations = IdToGosReader(UNIPROT_ASSOCIATIONS_FILE_PATH,
                                 godag=go_dag).get_id2gos('all')
    term_counts = TermCounts(go_dag, associations)

    go_freq_dict = {}
    for term in go_dag.values():
        # Look the count up once and reuse it for the alternate ids
        count = term_counts.get_count(term.id)
        go_freq_dict[term.id] = count
        for alt_id in term.alt_ids:
            go_freq_dict[alt_id] = count

    # write frequency dict to JSON file
    with open(FREQUENCY_COUNTS_FILE_PATH, 'w') as json_file:
        json.dump(go_freq_dict, json_file)
Esempio n. 4
0
def createMatrix(goTerms, background, method):
    """Return a semantic similarity matrix as a list of rows.

    Keyword arguments:
    goTerms -- list of go terms
    background -- flattened background: lists of genes and GO Terms
    method -- semantic similarity method, either "Lin", "Resnik", "Wang" or "Edge-based"

    Only entries whose row index is smaller than their column index are
    computed; every other entry (diagonal and lower half) is -1. Any method
    other than "Lin", "Resnik" or "Wang" uses semantic_similarity. The
    ontology is read from the module-level name ``godag``.
    """
    termcounts = TermCounts(godag, background)
    # The Wang calculator is only built when that method is requested
    wang_r1 = SsWang(goTerms, godag) if method == "Wang" else None

    matrix = []
    for i, termA in enumerate(goTerms):
        row = []
        for j, termB in enumerate(goTerms):
            if i >= j:
                # Diagonal and lower half are placeholders
                row.append(-1)
                continue
            if method == "Lin":
                sim = lin_sim(termA, termB, godag, termcounts)
            elif method == "Resnik":
                sim = resnik_sim(termA, termB, godag, termcounts)
            elif method == "Wang":
                sim = wang_r1.get_sim(termA, termB)
            else:
                sim = semantic_similarity(termA, termB, godag)
            row.append(sim)
        matrix.append(row)
    return matrix
Esempio n. 5
0
def test_write_hier_bp_mf_cc():
    """Test that write hierarchy writes all: BP, MF, CC"""
    fin_anno = os.path.join(REPO, 'gene2go')
    fin_dag = os.path.join(REPO, "go-basic.obo")
    _dnld_anno(fin_anno)

    print('\nTEST STORING ONLY ONE SPECIES')
    godag = get_godag(fin_dag)
    gene2gos = read_annotations(namespace='ALL')
    # Term counts are only built when annotations were returned
    tcntobj = None
    if gene2gos:
        tcntobj = TermCounts(godag, gene2gos)
    gosubdag = GoSubDag(
        godag.keys(), godag,
        relationships=False, tcntobj=tcntobj, children=True, prt=sys.stdout)
    objwr = WrHierGO(gosubdag)

    # 2020 11:
    #     594,748 GO lines under GO:0008150
    #      23,199 GO lines under GO:0003674
    #       6,259 GO lines under GO:0005575
    #     624,206 items WROTE: tmp_test_wr_hier_BP_MF_CC.txt
    # Each entry: namespaces to write, and a lower bound on the item count
    lower_bounds = [
        (['BP', 'MF', 'CC'], 600000),
        (['BP'], 500000),
        (['MF'], 20000),
        (['CC'], 5000),
    ]
    for nss, minimum in lower_bounds:
        assert len(_wr_hier(nss, gosubdag.go2nt, objwr)) > minimum
Esempio n. 6
0
def get_tcntobj(go2obj, **kws):
    """Build a TermCounts object from the annotations selected by kws.

    Returns None when get_objanno_g_kws yields no annotation object.
    """
    # Annotation keywords: gpad gaf gene2go id2gos
    objanno = get_objanno_g_kws(**kws)
    if not objanno:
        return None
    id2gos = objanno.get_id2gos_nss()
    return TermCounts(go2obj, id2gos)
def test_tcntobj_relationships(do_plt=False):
    """Test loading of relationships, like part_of, into TermCounts

    When do_plt is true, one PNG is plotted for the counts without
    relationships and one for the counts with relationships.
    """
    # Input files and optional plot outputs
    fin_obo = os.path.join(REPO, "tests/data/yangRWC/fig2a.obo")
    fin_anno = os.path.join(REPO, "tests/data/yangRWC/fig2a.anno")
    fout_png_r0 = os.path.join(REPO, 'yang_fig2a_r0.png')
    fout_png_r1 = os.path.join(REPO, 'yang_fig2a_r1.png')
    relationships = {'part_of'}

    # Ontology is loaded with the optional relationship attribute
    go2obj = GODag(fin_obo, optional_attrs=['relationship'])

    # CC annotations
    assoc = IdToGosReader(fin_anno, godag=go2obj).get_id2gos('CC')

    # Term counts without, then with, relationships
    tcntobj_r0 = TermCounts(go2obj, assoc)
    # relationship: G (GO:0000007) is part_of F (GO:0000006)
    tcntobj_r1 = TermCounts(go2obj, assoc, relationships)

    cnts_r0 = tcntobj_r0.gocnts
    cnts_r1 = tcntobj_r1.gocnts
    # Adding relationships does not change the total count of genes:
    assert cnts_r0['GO:0005575'] == cnts_r1['GO:0005575']
    # Counts without relationships:
    assert cnts_r0['GO:0000002'] == 40  # GO Term B
    assert cnts_r0['GO:0000006'] == 10  # GO Term F
    # Counts with relationships: F counts G's 30 genes, so does B
    assert cnts_r1['GO:0000002'] == 70  # GO Term B
    assert cnts_r1['GO:0000006'] == 40  # GO Term F

    # Optionally visualize the difference between term counts w and wo/relationships
    if do_plt:
        plots = ((tcntobj_r0, fout_png_r0), (tcntobj_r1, fout_png_r1))
        for tcntobj, fout_png in plots:
            go2txt = {}
            for nt in tcntobj.gosubdag.go2nt.values():
                go2txt[nt.GO] = 'tcnt={}'.format(nt.tcnt)
            GoSubDagPlot(tcntobj.gosubdag, go2txt=go2txt).plt_dag(fout_png)
def test_tcntobj_relationships(prt=sys.stdout):
    """Test loading of relationships, like part_of, into TermCounts

    Builds one TermCounts per namespace in NSS without relationships and one
    with the relationships in RELS, then hands both to _chk_pass_fail.
    """
    fin_obo = os.path.join(REPO, "go-basic.obo")
    fin_anno = os.path.join(REPO, 'goa_human.gpad')

    # Fetch the input files
    download_go_basic_obo(fin_obo, prt, loading_bar=None)
    dnld_annotation(fin_anno)

    # Ontology without and with the optional relationship attribute
    go2obj_r0 = GODag(fin_obo)
    go2obj_r1 = GODag(fin_obo, optional_attrs=['relationship'])

    # Annotations are read against the ontology loaded without relationships
    annoobj = GpadReader(fin_anno, godag=go2obj_r0)

    # Term counts per namespace, first without relationships
    ns2tcntobj_r0 = {}
    for nspc in NSS:
        ns2tcntobj_r0[nspc] = TermCounts(go2obj_r0, annoobj.get_id2gos(nspc))
    # ... then with relationships
    ns2tcntobj_r1 = {}
    for nspc in NSS:
        ns2tcntobj_r1[nspc] = TermCounts(go2obj_r1, annoobj.get_id2gos(nspc), RELS)
    _chk_pass_fail(ns2tcntobj_r0, ns2tcntobj_r1)
Esempio n. 9
0
def test_semantic_similarity():
    """Check the TermCounts of terms I, L, M and N in the fig1a test data."""
    fin_obo = os.path.join(REPO, 'tests/data/yangRWC/fig1a.obo')
    fin_anno = os.path.join(REPO, 'tests/data/yangRWC/fig1a.anno')
    godag = GODag(fin_obo)
    # Terms are addressed by name in the assertions below
    name2go = {}
    for term in godag.values():
        name2go[term.name] = term.item_id
    assoc = _get_id2gos(fin_anno, godag, name2go)
    tcntobj = TermCounts(godag, assoc)
    for name in ('I', 'L', 'M', 'N'):
        assert tcntobj.gocnts[name2go[name]] == 50
Esempio n. 10
0
def test_semantic_similarity():
    """Print information content and Resnik/Lin scores for fixed GO terms.

    For each listed species the annotations are loaded with dnld_assc,
    TermCounts are built, and every pair of the GO IDs must produce truthy
    Resnik and Lin scores.
    """
    goids = sorted([
        "GO:0140101",
        "GO:0140097",
        "GO:0140096",
        "GO:0140098",
        "GO:0015318",
        "GO:0140110",
    ])
    # (species label, annotation file name)
    associations = [
        ('human', 'goa_human.gaf'),
        ('yeast', 'gene_association.sgd'),
    ]
    info_fmt = '{SPECIES} Information content {INFO:8.6f} {GO} {NAME}'
    pair_fmt = '{GO1} {GO2} {RESNIK:6.4f} {LIN:7.4f}'

    # Ontology and annotation files are resolved against the working directory
    cwd = os.getcwd()
    godag = get_godag(os.path.join(cwd, "go-basic.obo"), loading_bar=None)
    for species, assc_name in associations:
        print()
        # Annotations and term counts for the current species
        fin_assc = os.path.join(cwd, assc_name)
        assc_gene2gos = dnld_assc(fin_assc, godag, prt=None)
        termcounts = TermCounts(godag, assc_gene2gos)

        # Information content of each GO term
        for goid in goids:
            infocontent = get_info_content(goid, termcounts)
            line = info_fmt.format(SPECIES=species,
                                   INFO=infocontent,
                                   GO=goid,
                                   NAME=godag[goid].name)
            print(line)

        # Resnik and Lin similarity of each pair of GO terms
        print("GO #1      GO #2      Resnik Lin")
        print("---------- ---------- ------ -------")
        for go_a, go_b in itertools.combinations(goids, 2):
            # Resnik: information content of the most informative common ancestor
            sim_r = resnik_sim(go_a, go_b, godag, termcounts)
            sim_l = lin_sim(go_a, go_b, godag, termcounts)
            print(pair_fmt.format(GO1=go_a, GO2=go_b, RESNIK=sim_r, LIN=sim_l))
            assert sim_r, "FATAL RESNIK SCORE"
            assert sim_l, "FATAL LIN SCORE"
Esempio n. 11
0
 def __init__(self, gene2gos, objcli, godag_version):
     """Set up the GoSubDag, grouping defaults, section headers and version list.

     gene2gos -- annotations used to build TermCounts
     objcli -- object supplying results_all, godag, args, sections and get_pval_field
     godag_version -- first entry of self.ver_list
     """
     # GO IDs of all results
     result_goids = {rec.GO for rec in objcli.results_all}
     tcntobj = TermCounts(objcli.godag, gene2gos)
     self.gosubdag = GoSubDag(
         result_goids, objcli.godag,
         relationships=True, tcntobj=tcntobj, prt=sys.stdout)
     self.grprdflt = GrouperDflts(self.gosubdag, objcli.args.goslim)
     self.hdrobj = HdrgosSections(
         self.grprdflt.gosubdag, self.grprdflt.hdrgos_dflt, objcli.sections)
     self.pval_fld = objcli.get_pval_field()  # primary pvalue of interest
     sections_txt = "Sections: {S}".format(S=objcli.args.sections)
     self.ver_list = [godag_version, self.grprdflt.ver_goslims, sections_txt]
Esempio n. 12
0
def test_semantic_similarity(usr_assc=None):
    """Print TermCounts summaries for each annotation file.

    usr_assc -- optional single annotation file name; when given, only that
                file is processed instead of ASSOCIATIONS minus the excluded
                uniprot files.

    Files that yield no annotations are collected and reported through
    _prt_not_found. Elapsed times are printed relative to the module-level
    TIC and to the start of each file.
    """
    not_these = {'goa_uniprot_all.gaf', 'goa_uniprot_all_noiea.gaf'}
    associations = sorted(ASSOCIATIONS.difference(not_these))
    go2obj = get_go2obj()
    # http://current.geneontology.org/annotations/
    if usr_assc is not None:
        associations = [usr_assc]
    not_found = set()
    # The former ``errs`` list was never appended to, so the branch that
    # wrote 'namespace_errors.txt' could never run; both were removed.
    for assc_name in associations:  # Limit test numbers for speed
        tic = timeit.default_timer()
        # Download the annotation file if it is not present locally
        fin_gaf = os.path.join(REPO, assc_name)
        if not os.path.exists(fin_gaf):
            dnld_annotation(fin_gaf)
        annoobj = GafReader(fin_gaf)
        assc_gene2gos = annoobj.get_id2gos('all')
        if not assc_gene2gos:
            not_found.add(assc_name)
            continue

        # Initialize the counts of each GO term.
        tcntobj = TermCounts(go2obj, assc_gene2gos)
        go_cnt = tcntobj.gocnts.most_common()

        if go_cnt:
            print("{ASSC}".format(ASSC=assc_name))
            print(tcntobj.aspect_counts)
            # Largest count, used to derive the thresholds passed to prt_info
            gocnt_max = go_cnt[0][1]
            prt_info(tcntobj, go_cnt, None)
            prt_info(tcntobj, go_cnt, gocnt_max / 2.0)
            prt_info(tcntobj, go_cnt, gocnt_max / 10.0)
        print("{HMS} {hms} {ASSC}\n".format(ASSC=assc_name,
                                            HMS=_hms(TIC),
                                            hms=_hms(tic)))
    print('{HMS} {N} Associations'.format(HMS=_hms(TIC), N=len(associations)))
    if not_found:
        _prt_not_found(not_found)
Esempio n. 13
0
def test_semantic_similarity():
    """Check edge-based, information-content, Resnik and Lin scores.

    Uses two BP terms whose deepest common ancestor must be the BP root, so
    their Resnik and Lin scores must both be 0.0, and then checks the scores
    of the CC root compared with itself.
    """
    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
    # Annotations are read from tair.gaf
    associations = dnld_assc(os.path.join(REPO, 'tair.gaf'), godag)

    go_id3 = 'GO:0048364' # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707' # BP level-02 depth-02 single-multicellular organism process
    pair = {'GO1': go_id3, 'GO2': go_id4}

    # Semantic similarity computed from the DAG alone
    sim = semantic_similarity(go_id3, go_id4, godag)
    msg = '\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'
    print(msg.format(VAL=sim, **pair))
    print(godag[go_id3])
    print(godag[go_id4])

    # Term counts are needed for every information-content based measure
    termcounts = TermCounts(godag, associations)

    # Information content of a single term
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))
    assert infocontent, "FATAL INFORMATION CONTENT"

    # Resnik: information content of the most informative common ancestor.
    # Here the deepest common ancestor is the BP root, so the score is 0.0.
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    dca = deepest_common_ancestor([go_id3, go_id4], godag)
    assert dca == NS2GO['BP']
    assert sim_r == get_info_content(dca, termcounts)
    assert sim_r == 0.0
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=sim_r, **pair))

    # Lin: also 0.0, because the terms are only related through the BP root
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=sim_l, **pair))
    assert sim_l == 0.0, "FATAL LIN SCORE"

    # CC root compared with itself
    go_top_cc = NS2GO['CC']
    sim_r = resnik_sim(go_top_cc, go_top_cc, godag, termcounts)
    assert sim_r == 0.0
    sim_l = lin_sim(go_top_cc, go_top_cc, godag, termcounts)
    assert sim_l == 1.0
Esempio n. 14
0
def _run_full(fin_gpad, godag):
    """Count terms from all annotations at once and return the top-term counts.

    Returns a dict mapping 'BP', 'MF' and 'CC' to the count stored for the
    namespace's NS2GO term; each must equal the matching aspect count.
    """
    id2gos = GpadReader(fin_gpad, godag=godag).get_id2gos('all')
    tcntobj = TermCounts(godag, id2gos)
    top_cnt_all = {}
    for nspc in ['BP', 'MF', 'CC']:
        namespace = NS2NAMESPACE[nspc]
        top_cnt = tcntobj.gocnts[NS2GO[nspc]]
        top_cnt_all[nspc] = top_cnt
        assert top_cnt == tcntobj.aspect_counts[namespace]
    return top_cnt_all
Esempio n. 15
0
def test_semantic_i88():
    """Full set of annotations can be used to set TermCounts. No need to break it up."""
    godag = get_godag("go-basic.obo")
    fin_gaf = os.path.join(REPO, "tair.gaf")
    subsets = ['BP', 'MF', 'CC']

    # Annotations: everything first, then one namespace at a time
    ns2gene2gos = {}
    for nspc in ['ALL'] + subsets:
        ns2gene2gos[nspc] = read_annotations(gaf=fin_gaf, namespace=nspc)

    # Term counts, built in the same order as the annotations were read
    prt = sys.stdout
    ns2termcounts = {}
    for nspc in ['ALL'] + subsets:
        ns2termcounts[nspc] = TermCounts(godag, ns2gene2gos[nspc], prt=prt)

    # Every count from a single namespace must match the full GO counts
    termcounts_all = ns2termcounts['ALL']
    for nspc in subsets:
        for goid, cnt in ns2termcounts[nspc].gocnts.items():
            assert termcounts_all.gocnts[goid] == cnt
Esempio n. 16
0
def test_i148b_semsim_lin(do_plt=False):
    """Test for issue 148, Lin Similarity if a term has no annotations

    Computes Resnik and Lin similarity for every pair produced by
    combo_w_rplc over all GO IDs in fig2a.obo, prints both tables, and checks
    the Lin values with _chk_lin.

    do_plt -- when true, _do_plt is called with the term counts and the DAG
    """
    fin_gaf = os.path.join(REPO, 'tests/data/yangRWC/fig2a_nonleaf0.gaf')
    godag = GODag(os.path.join(REPO, "tests/data/yangRWC/fig2a.obo"))
    annoobj = GafReader(fin_gaf, godag=godag)

    associations = annoobj.get_id2gos('CC')
    tcntobj = TermCounts(godag, associations)

    if do_plt:
        _do_plt(tcntobj, godag)

    goids = list(godag.keys())

    # Calculate Resnik values; a pair is keyed by the set of its GO IDs
    p2r = {
        frozenset([a, b]): resnik_sim(a, b, godag, tcntobj)
        for a, b in combo_w_rplc(goids, 2)
    }
    _prt_values('Resnik', goids, p2r)

    # Calculate Lin values
    p2l = {
        frozenset([a, b]): lin_sim(a, b, godag, tcntobj)
        for a, b in combo_w_rplc(goids, 2)
    }
    _prt_values('Lin', goids, p2l)
    _chk_lin(p2l)
    # An exact copy of the two calculations above used to follow a bare
    # ``return`` here and could never run; it was removed.
Esempio n. 17
0
 def __init__(self, args=None, prt=sys.stdout):
     """Parse the arguments, load the DAG and annotations, and build the GoSubDag.

     args -- arguments handed to DocOptParse.get_docargs
     prt -- stream passed to get_godag and GoSubDag
     """
     docopt = DocOptParse(__doc__, self.kws_dct_all, self.kws_set_all)
     self.kws = docopt.get_docargs(args, intvals={'max_indent', 'dash_len'})
     # Optional OBO attributes that were requested through the keywords
     opt_attrs = OboOptionalAttrs.attributes.intersection(self.kws.keys())
     godag = get_godag(self.kws['dag'], prt, optional_attrs=opt_attrs)
     self.gene2gos = read_annotations(**self.kws)
     # Term counts exist only when read_annotations returned something
     if self.gene2gos is not None:
         self.tcntobj = TermCounts(godag, self.gene2gos)
     else:
         self.tcntobj = None
     use_relationships = 'relationship' in opt_attrs
     self.gosubdag = GoSubDag(
         godag.keys(), godag,
         relationships=use_relationships,
         tcntobj=self.tcntobj,
         children=True,
         prt=prt)
     self.goids = self._init_goids()
     self._adj_item_marks()
     self._adj_include_only()
     self._adj_for_assc()
Esempio n. 18
0
def test_semantic_similarity():
    """Test initializing TermCounts with annotations made to alternate GO ID"""
    fin_obo = os.path.join(REPO, '../goatools/tests/data/yangRWC/fig2a.obo')
    godag = GODag(fin_obo)
    file_id2gos = os.path.join(REPO, '../goatools/tests/data/yangRWC/fig2a.anno')
    # Terms are addressed by name in the expectations below
    name2go = {}
    for term in godag.values():
        name2go[term.name] = term.item_id
    assoc = _get_id2gos(file_id2gos, godag, name2go, NAME2NUM)
    tcntobj = TermCounts(godag, assoc)
    # N_v: Test accuracy of Python equivalent to Java: getNumberOfAnnotations
    # Test number of unique genes annotated to a GO Term PLUS genes annotated to a descendant
    expected = [
        ('A', 100),
        ('B', 40),
        ('C', 50),
        ('D', 10),
        ('E', 10),
        ('F', 10),
        ('G', 30),
    ]
    for name, num_genes in expected:
        assert tcntobj.gocnts[name2go[name]] == num_genes, tcntobj.gocnts
Esempio n. 19
0
def test_semantic_similarity():
    """Computing basic semantic similarities between GO terms.

    Prints the edge-based similarity, the information content, and the Resnik
    and Lin scores for two BP terms; the last three must be truthy.
    """
    # Ontology and annotation files are resolved against the working directory
    cwd = os.getcwd()
    godag = get_godag(os.path.join(cwd, "go-basic.obo"), loading_bar=None)
    associations = dnld_assc(os.path.join(cwd, 'gene_association.tair'), godag)

    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    pair = {'GO1': go_id3, 'GO2': go_id4}

    # Semantic similarity computed from the DAG alone
    sim = semantic_similarity(go_id3, go_id4, godag)
    msg = '\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'
    print(msg.format(VAL=sim, **pair))
    print(godag[go_id3])
    print(godag[go_id4])

    # Term counts are needed for every information-content based measure
    termcounts = TermCounts(godag, associations)

    # Information content of a single term
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(INFO=infocontent, GO=go_id))
    assert infocontent, "FATAL INFORMATION CONTENT"

    # Resnik: information content of the most informative common ancestor,
    # that is, the most specific common parent-term in the GO
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=sim_r, **pair))
    assert sim_r, "FATAL RESNIK SCORE"

    # Lin similarity
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=sim_l, **pair))
    assert sim_l, "FATAL LIN SCORE"
Esempio n. 20
0
def main(args):
    """Compute a similarity value per path and save the values to a text file.

    Reads from args: paths_file, obo_file, gene2go_file, namespace, n_cores
    and out_sims_file. When args.namespace is 'cc', 'mf' or 'bp', the ontology
    is reduced to a plain dict holding only the terms of that namespace; any
    other non-None value raises ValueError.
    """
    log_handlers = [
        logging.FileHandler("../logs/report.log"),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=log_handlers)
    logging.info(args)

    paths = utils.read_paths(args.paths_file)

    go = obo_parser.GODag(args.obo_file)
    gene2go = read_ncbi_gene2go(args.gene2go_file, taxids=[9606])
    # Term counts are taken from the full ontology, before any filtering
    termcounts = TermCounts(go, gene2go)

    if args.namespace is not None:
        # Translate the short option into the namespace stored on the terms
        if args.namespace == 'cc':
            wanted = 'cellular_component'
        elif args.namespace == 'mf':
            wanted = 'molecular_function'
        elif args.namespace == 'bp':
            wanted = 'biological_process'
        else:
            raise ValueError('namespace can be only cc, mf or bp')
        go = {
            go_term: values
            for go_term, values in go.items()
            if values.namespace == wanted
        }

    # One argument list per path for ``wrap``
    wrapped = []
    for path in paths:
        wrapped.append([path, go, gene2go, termcounts])

    if args.n_cores > 1:
        sims = list(p_map(wrap, wrapped))
    else:
        sims = list(map(wrap, tqdm(wrapped)))

    utils.create_dir_if_not_exist(dirname(args.out_sims_file))
    np.savetxt(args.out_sims_file, sims)
Esempio n. 21
0
 def __init__(self, obo, gaf, prt):
     """Load the ontology and annotations, then build term counts and a GoSubDag.

     obo -- ontology file name, joined onto REPO and "../goatools/"
     gaf -- annotation file handed to dnld_gaf
     prt -- stream stored on the instance and passed to GoSubDag
     """
     self.prt = prt
     self.cwd = os.getcwd()
     # Gene Ontologies
     fin_obo = os.path.join(REPO, "../goatools/", obo)
     self.go2obj_all = get_godag(fin_obo)
     # Annotations
     fin_gaf = dnld_gaf(gaf)
     print("GAF: {GAF}\n".format(GAF=fin_gaf))
     self.gene2gos = read_gaf(fin_gaf)
     self.tcntobj = TermCounts(self.go2obj_all, self.gene2gos)
     # GoSubDag; None is passed in place of a set of GO IDs
     self.gosubdag_all = GoSubDag(
         None, self.go2obj_all, tcntobj=self.tcntobj, prt=prt)
     self.prtfmt = self.gosubdag_all.prt_attr['fmta']
Esempio n. 22
0
def test_semantic_i88():
    """Computing basic semantic similarities between GO terms.

    Prints the edge-based similarity, the information content, and the Resnik
    and Lin scores for two BP terms; nothing is asserted.
    """
    godag = obo_parser.GODag("go-basic.obo")
    # Every key of the DAG is used. An earlier assignment that kept only keys
    # equal to the term's own ``id`` was overwritten immediately and has been
    # removed.
    goids = set(godag.keys())
    fin_gaf = os.path.join(REPO, "tair.gaf")
    # dnld_assc includes read_gaf
    associations = dnld_assc(fin_gaf, godag, prt=None)

    # First get the counts and information content for each GO term.
    termcounts = TermCounts(godag, associations)
    gosubdag = GoSubDag(goids, godag, tcntobj=termcounts)

    # Semantic similarity computed from the DAG alone
    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    go_root = deepest_common_ancestor([go_id3, go_id4], godag)
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.
          format(GO1=go_id3, GO2=go_id4, VAL=sim))
    gosubdag.prt_goids([go_root, go_id3, go_id4])

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id,
                                                           INFO=infocontent))

    # Resnik's similarity measure is defined as the information content of the most
    # informative common ancestor. That is, the most specific common parent-term in
    # the GO.
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3,
                                                                  GO2=go_id4,
                                                                  VAL=sim_r))

    # Lin similarity
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3,
                                                               GO2=go_id4,
                                                               VAL=sim_l))
Esempio n. 23
0
def test_semantic_similarity(usr_assc=None):
    """Print TermCounts summaries for each annotation file.

    usr_assc -- optional single annotation file name; when given, only that
                file is processed instead of ASSOCIATIONS minus the excluded
                uniprot files.

    Files that yield no annotations are reported through _prt_not_found.
    Elapsed times are printed relative to the module-level TIC and to the
    start of each file.
    """
    excluded = {'goa_uniprot_all.gaf', 'goa_uniprot_all_noiea.gaf'}
    associations = sorted(ASSOCIATIONS.difference(excluded))
    go2obj = get_go2obj()
    # http://current.geneontology.org/annotations/
    if usr_assc is not None:
        associations = [usr_assc]
    cwd = os.getcwd()
    not_found = set()
    time_fmt = "{HMS} {hms} {ASSC}\n"
    for assc_name in associations:  # Limit test numbers for speed
        tic = timeit.default_timer()
        # Annotation files are resolved against the working directory
        fin_assc = os.path.join(cwd, assc_name)
        assc_gene2gos = dnld_assc(fin_assc, go2obj, prt=sys.stdout)
        if not assc_gene2gos:
            not_found.add(assc_name)
            continue

        # Counts of each GO term, most frequent first
        termcounts = TermCounts(go2obj, assc_gene2gos)
        go_cnt = termcounts.gocnts.most_common()

        if go_cnt:
            print("{ASSC}".format(ASSC=assc_name))
            print(sorted(termcounts.aspect_counts.most_common()))
            # Largest count, used to derive the thresholds passed to prt_info
            gocnt_max = go_cnt[0][1]
            for threshold in (None, gocnt_max / 2.0, gocnt_max / 10.0):
                prt_info(termcounts, go_cnt, threshold)
        print(time_fmt.format(ASSC=assc_name, HMS=_hms(TIC), hms=_hms(tic)))
    print('{HMS} {N} Associations'.format(HMS=_hms(TIC), N=len(associations)))
    if not_found:
        _prt_not_found(not_found)
def _test_path_bp_mf(branch_dist, godag, prt):
    """Test distances between BP branch and MF branch.

    Without branch_dist, all four measures between the MF and BP term must be
    None. With branch_dist, the semantic distance must equal the sum of both
    term depths plus branch_dist. Values are written to prt unless it is None.
    """
    go_mf = 'GO:0003676'  # level-03 depth-03 nucleic acid binding [molecular_function]
    go_bp = 'GO:0007516'  # level-04 depth-05 hemocyte development [biological_process]
    fmt = '({GO1}, {GO2}) {TYPE:6} score = {VAL}\n'

    # Measures computed without a branch distance
    dst_none = semantic_distance(go_mf, go_bp, godag)
    sim_none = semantic_similarity(go_mf, go_bp, godag)
    termcounts = TermCounts(godag, dnld_assc("gene_association.tair", godag))
    sim_r = resnik_sim(go_mf, go_bp, godag, termcounts)
    sim_l = lin_sim(go_mf, go_bp, godag, termcounts)

    labeled = [
        ('semantic distance', dst_none),
        ('semantic similarity', sim_none),
        ('Resnik similarity', sim_r),
        ('Lin similarity', sim_l),
    ]
    if prt is not None:
        for label, value in labeled:
            prt.write(fmt.format(TYPE=label, GO1=go_mf, GO2=go_bp, VAL=value))
    for _, value in labeled:
        assert value is None

    # Distance computed with the supplied branch distance
    sim_d = semantic_distance(go_mf, go_bp, godag, branch_dist)
    if prt is not None:
        prt.write(
            fmt.format(TYPE='semantic distance', GO1=go_mf, GO2=go_bp, VAL=sim_d))
    assert sim_d == godag[go_mf].depth + godag[go_bp].depth + branch_dist
Esempio n. 25
0
def _run_each(fin_gpad, godag):
    """Load one annotation namespace (BP, MF, CC) at a time and check top-term counts.

    For each namespace, a GpadReader restricted to that namespace is created
    and a TermCounts object is built from its associations. The count of the
    namespace's top GO term is asserted to equal the namespace's aspect count
    and to be the largest count of any GO term.

    Args:
        fin_gpad: Annotation file handed to GpadReader.
        godag: GO DAG handed to GpadReader and TermCounts.

    Returns:
        dict mapping each of 'BP', 'MF', 'CC' to the count of its top GO term.
    """
    top_counts = {}
    for nspc in ('BP', 'MF', 'CC'):
        aspect = NS2NAMESPACE[nspc]
        top_goid = NS2GO[nspc]

        reader = GpadReader(fin_gpad, godag=godag, namespaces={nspc})
        id2gos = reader.get_ns2assc()[nspc]
        print('{NS} NUM ASSOC {N:6,}'.format(NS=nspc, N=len(id2gos)))

        counts = TermCounts(godag, id2gos)
        top_cnt = counts.gocnts[top_goid]
        aspect_cnt = counts.aspect_counts[aspect]
        print('{NS} GOATOOLS TERM COUNTS: {N}/{M}'.format(
            NS=nspc, N=top_cnt, M=aspect_cnt))

        top_counts[nspc] = top_cnt
        assert top_cnt == aspect_cnt, '{NS} {A} != {B}'.format(
            NS=nspc, A=top_cnt, B=aspect_cnt)
        # The top term must carry the largest count of any term
        assert top_cnt == max(counts.gocnts.values())
    return top_counts
Esempio n. 26
0
def test_i148_semsim_lin(prt=sys.stdout):
    """Test for issue 148, Lin Similarity if a term has no annotations.

    Downloads the human GPAD annotation file if needed, builds TermCounts from
    its 'CC' associations, computes the Lin similarity for every pair of the
    GO IDs listed below (pairs drawn with replacement), and prints the values.

    Args:
        prt: Stream the computed values are printed to (default: sys.stdout).
    """
    fin_gpad = os.path.join(REPO, 'goa_human.gpad')
    dnld_annofile(fin_gpad, 'gpad')

    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
    annoobj = GpadReader(fin_gpad, godag=godag)

    goids = [
        'GO:0042581', 'GO:0101002', 'GO:0042582', 'GO:0070820', 'GO:0008021',
        'GO:0005766', 'GO:0016591'
    ]

    associations = annoobj.get_id2gos('CC')
    termcounts = TermCounts(godag, associations)

    # Calculate Lin values; frozenset keys make (a, b) and (b, a) the same entry
    p2v = {
        frozenset([a, b]): lin_sim(a, b, godag, termcounts)
        for a, b in combo_w_rplc(goids, 2)
    }
    # Print to the caller-supplied stream; it was previously ignored in favor
    # of a hard-coded sys.stdout.
    _prt_values(goids, p2v, prt=prt)
def test_semantic_similarity():
    """Compute and print basic semantic similarities between two GO terms.

    Prints, in order: the semantic similarity of the pair, both GO term
    records, the information content of one term, and the Resnik and Lin
    similarity scores of the pair.
    """
    godag = obo_parser.GODag("go-basic.obo")
    # All annotations for arabidopsis
    associations = read_gaf("http://geneontology.org/gene-associations/gene_association.tair.gz")

    term_a = 'GO:0048364'  # BP level-03 depth-04 root development
    term_b = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    pair = {'GO1': term_a, 'GO2': term_b}

    # Semantic similarity of the pair.
    # Sample output recorded with the original code: 0.25
    sim = semantic_similarity(term_a, term_b, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        VAL=sim, **pair))
    for goid in (term_a, term_b):
        print(godag[goid])

    # Information content of the single term GO:0048364; it needs the count
    # of each GO term first.
    # Sample output recorded with the original code: 7.75481392334
    termcounts = TermCounts(godag, associations)
    go_id = "GO:0048364"
    print('\nInformation content ({GO}) = {INFO}\n'.format(
        GO=go_id, INFO=get_info_content(go_id, termcounts)))

    # Resnik's similarity measure is the information content of the most
    # informative common ancestor, i.e. the most specific common parent term.
    # Sample output recorded with the original code: 4.0540784252
    sim_r = resnik_sim(term_a, term_b, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=sim_r, **pair))

    # Lin similarity.
    # Sample output recorded with the original code: -0.607721957763
    sim_l = lin_sim(term_a, term_b, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=sim_l, **pair))
    def __init__(self,
                 go_file,
                 go_terms,
                 gaf,
                 omadb=None,
                 tarfile_ortho=None,
                 TermCountsFile=None):
        """Open the orthology source and load the GO resources.

        Args:
            go_file: OBO file parsed with obo_parser.GODag; the parsed DAG
                ends up on ``self.go_file``.
            go_terms: Path to a pickle file; its content is stored on
                ``self.godf``.
            gaf: Argument for goatools_utils.buildGAF; the result is stored
                on ``self.gaf``.
            omadb: Optional path to an OMA database file. When given, it is
                opened read-only and wrapped in a pyoma Database
                (``self.db_obj``); it takes precedence over tarfile_ortho.
            tarfile_ortho: Optional path to a gzip-compressed tar archive,
                opened as ``self.tar`` when omadb is not given.
            TermCountsFile: Optional path to a pickled term-counts object.
                When None, the counts are computed from the DAG and the GAF.

        Raises:
            Exception: If neither omadb nor tarfile_ortho is provided.
        """
        self.go_file = go_file

        # The database / archive handles stay open on purpose: they are kept
        # on the instance for later use.
        if omadb:
            print('open oma db obj')
            from pyoma.browser import db
            h5_oma = open_file(omadb, mode="r")
            self.db_obj = db.Database(h5_oma)
            print('done')
        elif tarfile_ortho:
            # retrieve hog members from tarfile_ortho
            self.tar = tarfile.open(tarfile_ortho, "r:gz")
        else:
            raise Exception('please provide input dataset')

        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(go_terms, 'rb') as go_terms_in:
            self.godf = pickle.load(go_terms_in)
        # From here on self.go_file is the parsed GO DAG, not the file name
        self.go_file = obo_parser.GODag(go_file)
        print('building gaf')
        self.gaf = goatools_utils.buildGAF(gaf)
        print('done')
        if TermCountsFile is None:
            self.termcounts = TermCounts(self.go_file, self.gaf)
        else:
            # NOTE: same trust requirement as for go_terms above.
            with open(TermCountsFile, 'rb') as termcounts_in:
                self.termcounts = pickle.load(termcounts_in)
        # Resnik similarity with the GO dataframe and term counts pre-bound
        self.resniksimpreconf = partial(goatools_utils.resnik_sim_pandas,
                                        df=self.godf,
                                        termcounts=self.termcounts)
Esempio n. 29
0
def get_termcounts(fin_anno, godag, namespace='all', **kws):
    """Build a TermCounts object from an annotation file.

    Args:
        fin_anno: Annotation file handed to get_objanno.
        godag: GO DAG handed to get_objanno and TermCounts.
        namespace: Namespace selector forwarded to get_objanno and to the
            annotation object's get_id2gos (default 'all').
        **kws: Extra keyword arguments forwarded to get_id2gos.

    Returns:
        TermCounts built from the GO DAG and the id-to-GO associations.
    """
    anno_reader = get_objanno(fin_anno, godag, namespace)
    return TermCounts(godag, anno_reader.get_id2gos(namespace=namespace, **kws))
Esempio n. 30
0
# Module-level setup: locate (or fetch) the GO ontology file, parse it, and
# build the term counts once at import time.
obo_file = os.path.join(id_mapping_dir, 'go.obo')

# First use: the ontology file is not on disk yet, so fetch it.
if not os.path.exists(obo_file):
    print("Using ontology for first time")
    print("Downloading files")
    # Imported here so the downloader is only loaded when it is needed.
    from magine.enrichment.databases.gene_ontology import \
        download_and_process_go
    download_and_process_go()
    # The download step is expected to have created obo_file.
    assert os.path.exists(obo_file)

# Parsed GO DAG.
go = obo_parser.GODag(obo_file)

mg = MagineGO()
print("Loading termcounts")
# presumably a gene -> GO terms mapping (going by the attribute name) — TODO confirm
associations = mg.gene_to_go
termcounts = TermCounts(go, associations)
print("Loaded termcounts")


def path_to_root(go_term):
    """
    Creates networkx graph from provided term to root term

    Parameters
    ----------
    go_term : str
        source GO term

    Returns
    -------
    graph : nx.DiGraph