Beispiel #1
0
def test_gaf_read(log=sys.stdout):
    """Read GAFs for three species in both directions and sanity-check the counts.

    Args:
        log: Writable text stream that receives the progress report.
    """
    # GAF names for human(9606), mouse(10090), and fly(7227)
    gaf_names = ['goa_human', 'mgi', 'fb']
    # Three-level dictionary handed to read_gaf and read back per taxid below
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    test_dir = os.path.dirname(os.path.abspath(__file__))
    ids_pat = "  {N:>6,} IDs found in {F}\n"
    gos_pat = "  {N:>6,} GOs found in {F}\n"
    for gaf_name in dnld_gafs(gaf_names):
        gaf_path = os.path.join(test_dir, gaf_name)
        log.write("\n")
        # Gene-to-GO direction; this is the read that receives taxid2asscs
        gene2gos = read_gaf(gaf_path, taxid2asscs=taxid2asscs)
        if "gene_association.mgi" in gaf_path:
            _chk_key(gene2gos, "MGI:")
        log.write(ids_pat.format(N=len(gene2gos), F=gaf_path))
        # GO-to-gene direction
        go2genes = read_gaf(gaf_path, go2geneids=True)
        _chk_key(go2genes, "GO:")
        log.write(gos_pat.format(N=len(go2genes), F=gaf_path))
    # Summarize what ended up in the per-taxid dictionary
    log.write("\n")
    summary_pat = "{N:>6,} GOs and {M:>6,} annotated gene ids for tax_id: {TAXID:>6}\n"
    for taxid, asscs in taxid2asscs.items():
        gene_cnt = len(asscs.get('ID2GOs'))
        go_cnt = len(asscs.get('GO2IDs'))
        log.write(summary_pat.format(TAXID=taxid, N=go_cnt, M=gene_cnt))
        # Coarse lower bounds: confirms data was actually downloaded and parsed
        assert gene_cnt > 11000
        assert go_cnt > 6000
Beispiel #2
0
def test_gaf_read(log=sys.stdout):
    """Read the human GAF under several ND/NOT settings and log the GO counts.

    Args:
        log: Writable text stream that receives one message per read.
    """
    # On 2017/04/10, there were 3 GO IDs with ND Evidence Codes:
    #
    #    $ cut -f5,7 goa_human.gaf | grep ND | sort | uniq -c
    #        739 GO:0003674      ND
    #        484 GO:0005575      ND
    #        639 GO:0008150      ND

    # Example species_ids: goa_human mgi fb
    gaf_path = dnld_gaf('goa_human', loading_bar=None)

    # One row per example: (log message, extra keyword arguments for read_gaf)
    examples = (
        # Example 1: all defaults
        ("Read {N} GOs with all default values\n\n", {}),
        # Example 2: defaults spelled out (no NOT Qualifiers, no ND Evidence Codes)
        ("Read {N} GOs; keepif is default in goatools.associations.read_gaf\n\n",
         {'keep_ND': False, 'keep_NOT': False}),
        # Example 3: allow GOs with ND Evidence Codes
        ("Read {N} GOs; Allow ND Evidence codes\n\n", {'keep_ND': True}),
        # Example 4: allow everything, including NOT Qualifiers and ND Evidence Codes
        ("Read {N} GOs; Allow ND Evidence codes and NOT Qualifiers\n\n",
         {'keep_ND': True, 'keep_NOT': True}),
    )
    for message, extra in examples:
        go2genes = read_gaf(gaf_path, go2geneids=True, **extra)
        log.write(message.format(N=len(go2genes)))
def test_gaf_read(log=sys.stdout):
    """Log how many GO IDs the human GAF yields under different ND/NOT filters.

    Args:
        log: Writable text stream that receives one message per read.
    """
    # On 2017/04/10, there were 3 GO IDs with ND Evidence Codes:
    #
    #    $ cut -f5,7 goa_human.gaf | grep ND | sort | uniq -c
    #        739 GO:0003674      ND
    #        484 GO:0005575      ND
    #        639 GO:0008150      ND

    # Example species_ids: goa_human mgi fb
    gaf_path = dnld_gaf('goa_human', loading_bar=None)

    def _count_gos(**filters):
        """Return the number of GO IDs read_gaf yields with the given flags."""
        return len(read_gaf(gaf_path, go2geneids=True, **filters))

    # Example 1: Read GAF with every option left at its default
    num_gos = _count_gos()
    log.write("Read {N} GOs with all default values\n\n".format(N=num_gos))

    # Example 2: Defaults spelled out (No NOT Qualifiers and no ND Evidence Codes)
    num_gos = _count_gos(keep_ND=False, keep_NOT=False)
    log.write("Read {N} GOs; keepif is default in goatools.associations.read_gaf\n\n".format(
        N=num_gos))

    # Example 3: Allow GOs with ND Evidence Codes
    num_gos = _count_gos(keep_ND=True)
    log.write("Read {N} GOs; Allow ND Evidence codes\n\n".format(N=num_gos))

    # Example 4: Allow all GOs, even those with NOT Qualifiers or ND Evidence Codes
    num_gos = _count_gos(keep_ND=True, keep_NOT=True)
    log.write("Read {N} GOs; Allow ND Evidence codes and NOT Qualifiers\n\n".format(N=num_gos))
def _test_gaf_read(msg, species_ids, keepif, log=sys.stdout):
    """Read each species' GAF with a keepif filter and check the per-taxid totals.

    Args:
        msg: Heading written to log ahead of the per-taxid summary.
        species_ids: GAF names handed to dnld_gafs.
        keepif: Filter forwarded unchanged to every read_gaf call.
        log: Writable text stream that receives the report.
    """
    # Three-level dictionary handed to read_gaf and read back per taxid below
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    test_dir = os.path.dirname(os.path.abspath(__file__))
    bp_pat = "  {N:>6,} IDs found in BP  {F}\n"
    all_pat = "  {N:>6,} IDs found in ALL {F}\n"
    gos_pat = "  {N:>6,} GOs found in {F}\n"
    for gaf_name in dnld_gafs(species_ids, loading_bar=None):
        gaf_path = os.path.join(test_dir, gaf_name)
        log.write("\n")
        shared = {'taxid2asscs': taxid2asscs, 'keepif': keepif}
        # Default namespace first (reported as BP), then every namespace
        ids_bp = read_gaf(gaf_path, **shared)
        ids_all = read_gaf(gaf_path, namespace='all', **shared)
        assert len(ids_all) > len(ids_bp)
        if "mgi.gaf" in gaf_path:
            _chk_key(ids_bp, "MGI:")
        log.write(bp_pat.format(N=len(ids_bp), F=gaf_path))
        log.write(all_pat.format(N=len(ids_all), F=gaf_path))
        # GO-to-gene direction
        go2genes = read_gaf(gaf_path, go2geneids=True, keepif=keepif)
        _chk_key(go2genes, "GO:")
        log.write(gos_pat.format(N=len(go2genes), F=gaf_path))
    # Summarize what ended up in the per-taxid dictionary
    log.write("\n{MSG}\n".format(MSG=msg))
    summary_pat = "  {N:>6,} GOs and {M:>6,} annotated gene ids for tax_id: {TAXID:>6}\n"
    for taxid, asscs in taxid2asscs.items():
        gene_cnt = len(asscs.get('ID2GOs'))
        go_cnt = len(asscs.get('GO2IDs'))
        log.write(summary_pat.format(TAXID=taxid, N=go_cnt, M=gene_cnt))
        # Coarse lower bounds: confirms data was actually downloaded and parsed
        assert gene_cnt > 11000
        assert go_cnt > 6000
def test_missingsym():
    """Check how read_gaf treats records lacking the required DB_Symbol text."""
    # Reduced copy of the original gaf file (gene_association.mgi)
    gaf_path = os.path.join(REPO, "tests/data/gaf_missingsym.mgi")
    # The two gene products whose presence depends on allow_missing_symbol
    symbolless_ids = ('MGI:3643263', 'P84751')

    # Default read: gene products without a DB_Symbol are ignored
    strict = read_gaf(gaf_path)
    assert len(strict) == 16, len(strict)
    for gene_id in symbolless_ids:
        assert gene_id not in strict

    # Opt-in read: annotations are kept even without a DB_Symbol
    lenient = read_gaf(gaf_path, allow_missing_symbol=True)
    assert len(lenient) == 18
    for gene_id in symbolless_ids:
        assert gene_id in lenient
def _get_assc(godag):
    """Return the TAIR associations restricted to GO IDs present in godag.

    Every gene returned by read_gaf is kept; only its GO ID set is narrowed,
    so a gene may map to an empty set.
    """
    gaf_url = "http://geneontology.org/gene-associations/gene_association.tair.gz"
    dag_goids = set(godag.keys())
    return {
        gene: goids.intersection(dag_goids)
        for gene, goids in read_gaf(gaf_url).items()
    }
def test_gaf_read(log=sys.stdout):
    """Read GAFs for three species in both directions and sanity-check the counts.

    Args:
        log: Writable text stream that receives the progress report.
    """
    # GAF names for human(9606), mouse(10090), and fly(7227)
    gaf_names = ['goa_human', 'mgi', 'fb']
    # Three-level dictionary handed to read_gaf and read back per taxid below
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    ids_pat = "  {N:>6,} IDs found in {F}\n"
    gos_pat = "  {N:>6,} GOs found in {F}\n"
    for gaf_path in dnld_gafs(gaf_names):
        # Gene-to-GO direction; this is the read that receives taxid2asscs
        gene2gos = read_gaf(gaf_path, taxid2asscs=taxid2asscs)
        log.write(ids_pat.format(N=len(gene2gos), F=gaf_path))
        # GO-to-gene direction
        go2genes = read_gaf(gaf_path, go2geneids=True)
        log.write(gos_pat.format(N=len(go2genes), F=gaf_path))
    # Summarize what ended up in the per-taxid dictionary
    summary_pat = "{N:>6,} GOs and {M:>6,} annotated gene ids for tax_id: {TAXID:>6}\n"
    for taxid, asscs in taxid2asscs.items():
        gene_cnt = len(asscs['ID2GOs'])
        go_cnt = len(asscs['GO2IDs'])
        log.write(summary_pat.format(TAXID=taxid, N=go_cnt, M=gene_cnt))
        # Coarse lower bounds: confirms data was actually downloaded and parsed
        assert gene_cnt > 11000
        assert go_cnt > 6000
    def get_associations(self, ontology=None):
        """Build the gene-ID to GO-term associations, optionally for one ontology.

        Ontologies: P = biological process, F = molecular function,
            C = cellular component

        # Arguments
            ontology: str (optional), one of {"P", "F", "C"}

        # Returns
            dict: maps each gene ID to the GO terms annotated to it

        # Raises
            GeneOntologyError: if `ontology` is not valid
        """
        valid_codes = ("P", "F", "C")
        if not (ontology is None or ontology in valid_codes):
            raise GeneOntologyError(f"Not a valid ontology: {ontology}")

        # The GAF is re-read on every call (no caching on the instance)
        gaf_assocs = read_gaf(self.associations_path)

        # Re-key the GAF result through the records' symbol -> object-ID map;
        # keys without a matching DB_Object_Symbol are dropped
        symbol2id = {rec['DB_Object_Symbol']: rec['DB_Object_ID'] for rec in self}
        self.all_associations = {
            symbol2id[key]: gaf_assocs[key]
            for key in gaf_assocs
            if key in symbol2id
        }

        # Filter a deep copy so self.all_associations stays unfiltered
        working_copy = copy.deepcopy(self.all_associations)

        # Restrict to the object IDs of this instance's records
        # (presumably those carrying an accepted evidence code -- confirm
        # against remove_unwanted_genes)
        wanted_genes = {rec["DB_Object_ID"] for rec in self}
        filtered = self.remove_unwanted_genes(wanted_genes, working_copy)

        # Only consider GO terms from a particular ontology
        if ontology is not None:
            accepted_terms = self.ontology2term()[ontology]
            for _gene, go_terms in filtered.items():
                # Iterate over a copy because go_terms is mutated in the loop
                for go_id in go_terms.copy():
                    # Terms missing from self.go_dag are left in place; only
                    # known terms from another ontology are removed
                    if go_id in self.go_dag and go_id not in accepted_terms:
                        go_terms.remove(go_id)

        self.associations = filtered
        return filtered
 def __init__(self, obo, gaf, prt):
     """Load the GO DAG and GAF annotations, then build the full GoSubDag.

     Args:
         obo: OBO file name, joined onto REPO/../goatools/.
         gaf: GAF identifier handed to dnld_gaf.
         prt: Stored as self.prt and forwarded to GoSubDag
             (presumably an output stream -- confirm).
     """
     self.prt = prt
     self.cwd = os.getcwd()
     # Gene Ontologies
     obo_path = os.path.join(REPO, "../goatools/", obo)
     self.go2obj_all = get_godag(obo_path)
     # Annotations
     gaf_path = dnld_gaf(gaf)
     print("GAF: {GAF}\n".format(GAF=gaf_path))
     self.gene2gos = read_gaf(gaf_path)
     self.tcntobj = TermCounts(self.go2obj_all, self.gene2gos)
     # GoSubDag
     subdag = GoSubDag(None, self.go2obj_all, tcntobj=self.tcntobj, prt=prt)
     self.gosubdag_all = subdag
     self.prtfmt = subdag.prt_attr['fmta']
Beispiel #10
0
def test_gaf_read(log=sys.stdout):
    """Read three species' GAFs, log the sizes, and assert minimum per-taxid counts.

    Args:
        log: Writable text stream that receives the progress report.
    """
    # GAF names for human(9606), mouse(10090), and fly(7227)
    gaf_names = ['goa_human', 'mgi', 'fb']
    # Three-level dictionary handed to read_gaf and read back per taxid below
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))

    def _log_count(pattern, mapping, gaf_path):
        """Write one formatted size line for a mapping read from gaf_path."""
        log.write(pattern.format(N=len(mapping), F=gaf_path))

    for gaf_path in dnld_gafs(gaf_names):
        # Gene-to-GO direction; this is the read that receives taxid2asscs
        gene2gos = read_gaf(gaf_path, taxid2asscs=taxid2asscs)
        _log_count("  {N:>6,} IDs found in {F}\n", gene2gos, gaf_path)
        # GO-to-gene direction
        go2genes = read_gaf(gaf_path, go2geneids=True)
        _log_count("  {N:>6,} GOs found in {F}\n", go2genes, gaf_path)

    # Summarize what ended up in the per-taxid dictionary
    summary_pat = "{N:>6,} GOs and {M:>6,} annotated gene ids for tax_id: {TAXID:>6}\n"
    for taxid, asscs in taxid2asscs.items():
        gene_cnt = len(asscs['ID2GOs'])
        go_cnt = len(asscs['GO2IDs'])
        log.write(summary_pat.format(TAXID=taxid, N=go_cnt, M=gene_cnt))
        # Coarse lower bounds: confirms data was actually downloaded and parsed
        assert gene_cnt > 11000
        assert go_cnt > 6000
def _test_gaf_read(msg, species_ids, keepif, log=sys.stdout):
    """Read each species' GAF with a keepif filter and check the per-taxid totals.

    Args:
        msg: Heading written to log ahead of the per-taxid summary.
        species_ids: GAF names handed to dnld_gafs.
        keepif: Filter forwarded unchanged to every read_gaf call.
        log: Writable text stream that receives the report.
    """
    # Three-level dictionary handed to read_gaf and read back per taxid below
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    test_dir = os.path.dirname(os.path.abspath(__file__))
    ids_pat = "  {N:>6,} IDs found in {F}\n"
    gos_pat = "  {N:>6,} GOs found in {F}\n"
    for gaf_name in dnld_gafs(species_ids, loading_bar=None):
        gaf_path = os.path.join(test_dir, gaf_name)
        log.write("\n")
        # Gene-to-GO direction; this is the read that receives taxid2asscs
        gene2gos = read_gaf(gaf_path, taxid2asscs=taxid2asscs, keepif=keepif)
        if "mgi.gaf" in gaf_path:
            _chk_key(gene2gos, "MGI:")
        log.write(ids_pat.format(N=len(gene2gos), F=gaf_path))
        # GO-to-gene direction
        go2genes = read_gaf(gaf_path, go2geneids=True, keepif=keepif)
        _chk_key(go2genes, "GO:")
        log.write(gos_pat.format(N=len(go2genes), F=gaf_path))
    # Summarize what ended up in the per-taxid dictionary
    log.write("\n{MSG}\n".format(MSG=msg))
    summary_pat = "  {N:>6,} GOs and {M:>6,} annotated gene ids for tax_id: {TAXID:>6}\n"
    for taxid, asscs in taxid2asscs.items():
        gene_cnt = len(asscs.get('ID2GOs'))
        go_cnt = len(asscs.get('GO2IDs'))
        log.write(summary_pat.format(TAXID=taxid, N=go_cnt, M=gene_cnt))
        # Coarse lower bounds: confirms data was actually downloaded and parsed
        assert gene_cnt > 11000
        assert go_cnt > 6000
 def __init__(self, obo, gaf, prt):
     """Set up the GO DAG, the gene annotations, and the all-terms GoSubDag.

     Args:
         obo: OBO file name, joined onto REPO/../goatools/.
         gaf: GAF identifier handed to dnld_gaf.
         prt: Stored as self.prt and forwarded to GoSubDag
             (presumably an output stream -- confirm).
     """
     self.prt = prt
     self.cwd = os.getcwd()
     # Gene Ontologies
     obo_path = os.path.join(REPO, "../goatools/", obo)
     self.go2obj_all = get_godag(obo_path)
     # Annotations
     gaf_path = dnld_gaf(gaf)
     print("GAF: {GAF}\n".format(GAF=gaf_path))
     self.gene2gos = read_gaf(gaf_path)
     self.tcntobj = TermCounts(self.go2obj_all, self.gene2gos)
     # GoSubDag
     subdag_kws = {'tcntobj': self.tcntobj, 'prt': prt}
     self.gosubdag_all = GoSubDag(None, self.go2obj_all, **subdag_kws)
     self.prtfmt = self.gosubdag_all.prt_attr['fmta']
def test_semantic_similarity():
    """Run the basic GO-term similarity measures on one term pair and print them."""
    godag = obo_parser.GODag("go-basic.obo")
    # All annotations from arabidopsis (TAIR)
    associations = read_gaf("http://geneontology.org/gene-associations/gene_association.tair.gz")

    term_a = 'GO:0048364'  # BP level-03 depth-04 root development
    term_b = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process

    # Semantic similarity between the two terms; reference output:
    #       "The semantic similarity between terms GO:0048364 and GO:0044707 is 0.25.
    sim = semantic_similarity(term_a, term_b, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        GO1=term_a, GO2=term_b, VAL=sim))
    for term in (term_a, term_b):
        print(godag[term])

    # Information content of the single term GO:0048364; reference output:
    #       "Information content (GO:0048364) = 7.75481392334
    # It is derived from the per-term counts, so build those first.
    termcounts = TermCounts(godag, associations)
    infocontent = get_info_content(term_a, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=term_a, INFO=infocontent))

    # Resnik's similarity measure is defined as the information content of the most
    # informative common ancestor, i.e. the most specific common parent-term in
    # the GO. Reference output:
    #       "Resnik similarity score (GO:0048364, GO:0044707) = 4.0540784252
    sim_r = resnik_sim(term_a, term_b, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=term_a, GO2=term_b, VAL=sim_r))

    # Reference output:
    #       Lin similarity score (GO:0048364, GO:0044707) = -0.607721957763
    sim_l = lin_sim(term_a, term_b, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=term_a, GO2=term_b, VAL=sim_l))
Beispiel #14
0
    query_result_list = []
    for genes_chunk in np.array_split(genes, max(genes.shape[0] // 1000, 1)):
        query_res = mg.querymany(genes_chunk, scopes='entrezgene', fields='entrezgene,symbol',
                                 species='human', entrezonly=True, as_dataframe=True,
                                 df_index=False, verbose=False)
        if 'notfound' in query_res.columns:
            query_res = query_res[query_res.notfound != True]  # ignore PEP8 warnings.
        query_result_list.append(query_res)
    df_res = pd.concat(query_result_list)
    res = dict(zip(df_res.entrezgene, df_res.symbol))
    return res


# Module-level lookup tables, built once when this module is imported.
# Everything runs inside HidePrints() (presumably to silence the loaders'
# console output -- confirm against the HidePrints definition).
with HidePrints():
    # GO DAG parsed from the OBO file at go_obo_path
    _go_dag = obo_parser.GODag(go_obo_path)
    # GAF associations; prt=None is passed to read_gaf
    _gaf = read_gaf(gaf_path, prt=None)
    # Term counts derived from the DAG and the GAF associations
    _termcounts = TermCounts(_go_dag, _gaf)
    # NCBI gene2go associations read from gene2go_path
    _gene2go = read_ncbi_gene2go(gene2go_path)
    # gene -> symbol map, and its inverse (a symbol shared by several genes
    # keeps only the last gene seen)
    _gene2symbol = _init_gene2symbol_dict()
    _symbol2gene = {symbol: gene for gene, symbol in _gene2symbol.items()}


def get_genes():
    """Return a new list holding every key of the module-level _gene2go map."""
    return [*_gene2go.keys()]


def get_symbols():
    """Return a new list holding every value of the module-level _gene2symbol map."""
    return [*_gene2symbol.values()]


def get_gene2go():
Beispiel #15
0
                    qtl_old_info[qtl][i].append(entry[i])
        else:
            qtl_old_info[qtl] = [[] for c in c2g[d]]
    for i in range(len(c2g[d])):
        mhq_dat[d + '_' + c2g[d][i]] = mhq_dat['QTL'].apply(
            lambda q: ';'.join([str(c) for c in qtl_old_info[q][i]]))

mhq_dat.to_csv('../../Analysis/Multi_hit_QTLs.csv', index=False)

# GO term analysis, modified from
# https://github.com/tanghaibao/goatools/blob/master/notebooks/goea_nbt3102.ipynb

# Get http://geneontology.org/ontology/go-basic.obo
# NOTE(review): the name returned by download_go_basic_obo() is not reused;
# GODag is given the literal "go-basic.obo" -- confirm the two always agree.
obo_fname = download_go_basic_obo()
obodag = GODag("go-basic.obo")
# Gene/GO associations read from the local SGD GAF file. Source:
# http://downloads.yeastgenome.org/curation/literature/gene_association.sgd.gz
geneid2gos_yeast = read_gaf(
    '../accessory_files/gene_association.sgd'
)
# Build a gene-name -> gene-id map directly from the same GAF file:
# tab-separated column 3 is used as the name and column 2 as the id.
genename_2_id = dict()
with open('../accessory_files/gene_association.sgd', 'r') as infile:
    for line in infile:
        # Lines starting with '!' are skipped (presumably GAF header lines)
        if line[0] != '!':
            s = line.split('\t')
            genename_2_id[s[2]] = s[1]

# Inverse map; if several names share an id, the last name seen is kept
id_2_genename = {genename_2_id[i]: i for i in genename_2_id}
ids = [i for i in geneid2gos_yeast.keys()]

# Second space-separated token of 'Gene.Use' for rows with num.measured >= 50
all_measured_genes = set(tp.loc[tp['num.measured'] >= 50]['Gene.Use'].apply(
    lambda s: s.split(' ')[1]))
background_set = [
    genename_2_id.setdefault(i, 'NA') for i in all_measured_genes
Beispiel #16
0
def goe(
    genelist,
    go_file,
    goa_file,
    bg=None,
    nmin=5,
    conversion=None,
    evidence_set={
        'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'HTP', 'HDA', 'HMP', 'HGI', 'IBA',
        'IBD', 'IKR', 'IRD', 'ISS', 'ISO', 'ISA', 'ISM'
    }):
    """Finds GO enrichment with goatools (0.7.11 tested).

    **WARNING**: This method is inexact for multi-maps in gene name conversion. However, it has a negligible effect in top GO component removal in single-cell co-expression.

    Parameters
    ------------
    genelist:   list of str
        Genes to search for enrichment. Must be a non-empty list.
    go_file:    str
        File path for GO DAG (downloadable at http://geneontology.org/docs/download-ontology/).
    goa_file:   str
        File path for GO associations. See parameter **conversion**.
    bg:         list of str
        Background genes. Must contain every gene in genelist when conversion is used. Defaults to all genes in goa_file.
    nmin:       int
        Minimum number of principal genes required in GO. Values below 1 are raised to 1.
    conversion: tuple
        Conversion of `gene ID system <https://docs.mygene.info/en/latest/doc/data.html>`_ from gene list to the GO annotation, as (name_from, name_to, species).

        * name_from:    Gene naming system of genelist. For gene names, use 'symbol,alias'.
        * name_to:      Gene naming system of goa_file. Examples:

            * Human: use 'uniprot.Swiss-Prot' (for GO annotations downloaded from http://geneontology.org/gene-associations/goa_human.gaf.gz).
            * Mouse: use 'MGI' (for GO annotations downloaded from http://current.geneontology.org/annotations/mgi.gaf.gz).

        * species:      Species for gene name conversion. Examples: 'human', 'mouse'.

    evidence_set:   set of str
        `GO evidences <http://geneontology.org/docs/guide-go-evidence-codes/>`_ to include. Defaults to non-expression based results to avoid circular reasoning bias.

    Returns
    ----------
    goe:        pandas.DataFrame
        GO enrichment, sorted by uncorrected P-value.
    gotop:      str
        Top enriched GO ID
    genes:      list of str or None
        Intersection list of genes in gotop and also bg. None if bg is None.

    Raises
    ----------
    ValueError
        If the enrichment output is empty, or no GO term has odds ratio > 1
        with at least nmin study genes.
    AssertionError
        If genelist is not a non-empty list, conversion does not have three
        items, or genelist is not a subset of bg during conversion.

    """
    from tempfile import NamedTemporaryFile
    from os import linesep
    from goatools.go_enrichment import GOEnrichmentStudy
    from goatools.obo_parser import GODag
    from goatools.associations import read_gaf
    from collections import defaultdict
    import itertools
    from biothings_client import get_client
    import pandas as pd
    import logging
    assert type(genelist) is list and len(genelist) > 0
    if nmin < 1:
        nmin = 1

    # Keep the caller's background (in the original naming system); bg itself
    # may be replaced by converted names or by the GOA key list below.
    bg0 = bg
    # Convert gene names
    if conversion is not None:
        assert len(conversion) == 3
        name_from, name_to, species = conversion
        mg = get_client('gene')
        # Query every gene once: genelist plus background
        ans = set(genelist)
        if bg is not None:
            t1 = set(bg)
            assert len(ans - t1) == 0
            ans |= t1
        ans = list(ans)
        ans = mg.querymany(ans,
                           scopes=name_from,
                           fields=name_to,
                           species=species)
        # Keep only hits carrying the query, a score, and the top-level target field
        t1 = set(['query', '_score', name_to.split('.')[0]])
        ans = list(filter(lambda x: len(t1 - set(x)) == 0, ans))
        # Ascending score order: in the dict built next, a later (higher-score)
        # hit overwrites an earlier one for the same query.
        ans = sorted(ans, key=lambda x: x['_score'])
        convert = {x['query']: x for x in ans}
        # Descend the dotted field path of name_to one level at a time,
        # dropping queries that lack the field at any level
        for xi in name_to.split('.'):
            convert = filter(lambda x: xi in x[1], convert.items())
            convert = {x[0]: x[1][xi] for x in convert}
        # Non-str values are reduced to their first element (the source of the
        # multi-map inexactness noted in the docstring)
        convert = {
            x[0]: x[1] if type(x[1]) is str else x[1][0]
            for x in convert.items()
        }
        genelist2 = list(
            set([convert[x]
                 for x in filter(lambda x: x in convert, genelist)]))
        if bg is not None:
            bg = list(
                set([convert[x] for x in filter(lambda x: x in convert, bg)]))
        # converti: converted name -> original names, restricted to genelist
        t1 = set(genelist)
        converti = list(filter(lambda x: x[0] in t1, convert.items()))
        t1 = defaultdict(list)
        for xi in converti:
            t1[xi[1]].append(xi[0])
        converti = dict(t1)
        # convertia: converted name -> original names, over all queried genes
        t1 = defaultdict(list)
        for xi in convert.items():
            t1[xi[1]].append(xi[0])
        convertia = dict(t1)
    else:
        genelist2 = genelist

    # Load GO DAG and association files
    logging.debug('Reading GO DAG file ' + go_file)
    godag = GODag(go_file)
    logging.debug('Reading GO association file ' + goa_file)
    goa = read_gaf(goa_file, evidence_set=evidence_set)
    if bg is None:
        bg = list(goa.keys())

    # Compute enrichment
    goe = GOEnrichmentStudy(bg, goa, godag)
    ans = goe.run_study(genelist2)
    # Format output
    # wr_tsv is given the temporary file's path; the content is then read back
    # through the still-open handle f.
    with NamedTemporaryFile() as f:
        goe.wr_tsv(f.name, ans)
        ans = f.read()
    ans = ans.decode()
    ans = [x.split('\t') for x in ans.split(linesep)]
    # Drop the trailing row left by the final line separator
    if len(ans[-1]) < 2:
        ans = ans[:-1]
    if len(ans) == 0 or len(ans[0]) == 0:
        raise ValueError('No enrichment found. Check your input ID type.')
    # First row is the header; strip its leading '# ' marker
    ans[0][0] = ans[0][0].strip('# ')
    ans = pd.DataFrame(ans[1:], columns=ans[0])
    ans.drop(['NS', 'enrichment', 'study_count', 'p_sidak', 'p_holm'],
             axis=1,
             inplace=True)
    # Every column was parsed as text; convert the numeric ones
    for xj in ['p_uncorrected', 'p_bonferroni']:
        ans[xj] = pd.to_numeric(ans[xj], errors='raise')
    ans['depth'] = pd.to_numeric(ans['depth'],
                                 errors='raise',
                                 downcast='unsigned')
    # Odds ratio column and sort column
    # NOTE(review): toratio is defined outside this function; presumably it
    # turns 'a/b' ratio strings into numbers -- confirm.
    ans['odds_ratio'] = toratio(ans['ratio_in_study']) / toratio(
        ans['ratio_in_pop'])
    ans = ans[[
        'name', 'depth', 'p_uncorrected', 'p_bonferroni', 'odds_ratio',
        'ratio_in_study', 'ratio_in_pop', 'GO', 'study_items'
    ]]
    ans['study_items'] = ans['study_items'].apply(lambda x: x.replace(' ', ''))
    # Convert back study_items
    if conversion is not None:
        ans['study_items'] = ans['study_items'].apply(lambda x: ','.join(
            list(
                itertools.chain.from_iterable(
                    [converti[y] for y in x.split(',')])))
                                                      if len(x) > 0 else x)
    ans.sort_values('p_uncorrected', inplace=True)

    # Get top enriched GO by P-value
    # Candidates need odds ratio > 1 and at least nmin study genes (the
    # numerator of ratio_in_study)
    gotop = ans[
        (ans['odds_ratio'] > 1)
        & ans['ratio_in_study'].apply(lambda x: int(x.split('/')[0]) >= nmin)]
    if len(gotop) == 0:
        raise ValueError('No GO enrichment found for given criteria.')
    gotop = str(gotop.iloc[0]['GO'])
    if bg0 is not None:
        # Children GOs
        gos = set([gotop] + list(godag.query_term(gotop).get_all_children()))
        # Look for genes
        # Keep every annotated gene with at least one GO ID in gos
        genes = list(
            filter(lambda x: len(list(filter(lambda y: y in gos, goa[x]))) > 0,
                   goa))
        if conversion is not None:
            # Map converted names back to the caller's naming system
            genes = [
                convertia[x] for x in filter(lambda x: x in convertia, genes)
            ]
            genes = list(set(list(itertools.chain.from_iterable(genes))))
        genes = set(genes)
        # Filter bg0 itself so the result follows the caller's background order
        genes = list(filter(lambda x: x in genes, bg0))
    else:
        genes = None
    return (ans, gotop, genes)
Beispiel #17
0
# Convert ORF names to SGDIDs for GO analysis
multi_hit_sgdids = list(gene_info[gene_info['ORF'].isin(orf_names)]['SGDID'])

# GO DAG parsed from a local copy of the OBO file
obodag = GODag("../accessory_files/go-basic.obo"
               )  # http://geneontology.org/ontology/go-basic.obo
# Parse the GAF file by hand to build two lookups: tab-separated column 5 is
# used as the GO id, column 2 as the gene id, and column 3 as the gene name.
goid_to_gene_list = defaultdict(list)
genename_2_id = dict()
with open('../accessory_files/gene_association.sgd', 'r') as infile:
    for line in infile:
        # Lines starting with '!' are skipped (presumably GAF header lines)
        if line[0] != '!':
            s = line.split('\t')
            goid_to_gene_list[s[4]].append(s[1])
            genename_2_id[s[2]] = s[1]
# Inverse map; if several names share an id, the last name seen is kept
id_2_genename = {genename_2_id[i]: i for i in genename_2_id}
# Only looking at "biological process" GO terms
geneid2gos_yeast = read_gaf('../accessory_files/gene_association.sgd',
                            namespace='BP')
ids = [i for i in geneid2gos_yeast.keys()]
# Background = the gene id of every gene name seen in the GAF file
# (ids shared by several names appear more than once)
background_set = [genename_2_id[i] for i in genename_2_id]
goeaobj = GOEnrichmentStudy(
    background_set,  # List of all genes in analysis
    geneid2gos_yeast,  # geneid/GO associations
    obodag,  # Ontologies
    propagate_counts=False,
    alpha=0.05,  # default significance cut-off
    methods=['fdr_bh'])  # default multipletest correction method

# Keep only results with uncorrected p < 0.05, then order them by the
# Benjamini-Hochberg FDR-corrected p-value
goea_results_all = goeaobj.run_study(multi_hit_sgdids,
                                     keep_if=lambda x: x.p_uncorrected < 0.05)
go_results = sorted(goea_results_all, key=lambda r: r.p_fdr_bh)

cols = [