Example #1
def extend_gos_by_parents(GODag, gos, logger=None):
    gos_out = set()
    for go in gos:
        gos_out.add(go)
        try:
            gos_out.update(GODag.query_term(go).get_all_parents())
        except:
            if logger:
                logger.debug('Could not get parents for term {}.'.format(go))
    return gos_out
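
A minimal usage sketch for the helper above (the OBO path and GO IDs are placeholders, not values from the source):

from goatools.obo_parser import GODag

godag = GODag("go-basic.obo")  # hypothetical local path to a GO OBO file
seed_terms = {"GO:0008152", "GO:0009058"}  # example GO IDs
# Returns the seed terms plus every ancestor reachable through the DAG
extended = extend_gos_by_parents(godag, seed_terms)
print(len(extended))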
Example #2
def count_children(GODag, go):
    return len(GODag.query_term(go).get_all_children())
Example #3
        go_accs = set(interpro_go.loc[interpro_go['Protein Accession'] == protein]['GO Accession'])
        for go_acc in go_accs:
            if not pd.isnull(go_acc):
                all_go_accs_in_a_protein |= set(go_acc.split('|'))
        
        if len(all_go_accs_in_a_protein) > 0:
            for go_term in all_go_accs_in_a_protein:
                if go_term not in go:
                    continue
                    
                if USE_SLIM:
                    direct_anc, all_anc = mapslim(go_term, go, goslim)
                    all_goslim_anc_accs_in_a_protein |= all_anc
                    all_goslim_covered_anc |= (all_anc - direct_anc)

                query_term = go.query_term(go_term)
                output_table = output_table.append(
                    pd.DataFrame({'Protein Accession': [protein],
                                  'GO Category': [query_term.namespace],
                                  'GO Accession': [go_term],
                                  'GO Description': [query_term.name],
                                  'GO Level': [query_term.level]}),
                    ignore_index=True)

            if USE_SLIM:
                if ONLY_DIRECT:
                    all_goslim_direct_anc_accs_in_a_protein = all_goslim_anc_accs_in_a_protein - all_goslim_covered_anc
                    for goslim_term in all_goslim_direct_anc_accs_in_a_protein:
                        query_goslim_term = goslim.query_term(goslim_term)
                        output_slim_table = output_slim_table.append(
                            pd.DataFrame({'Protein Accession': [protein],
                                          'GO Category': [query_goslim_term.namespace],
                                          'GOSlim Accession': [goslim_term],
                                          'GOSlim Description': [query_goslim_term.name],
                                          'GOSlim Level': [query_goslim_term.level]}),
                            ignore_index=True)
                else:
                    for goslim_term in all_goslim_anc_accs_in_a_protein:
                        query_goslim_term = goslim.query_term(goslim_term)
                        output_slim_table = output_slim_table.append(
                            pd.DataFrame({'Protein Accession': [protein],
                                          'GO Category': [query_goslim_term.namespace],
                                          'GOSlim Accession': [goslim_term],
                                          'GOSlim Description': [query_goslim_term.name],
                                          'GOSlim Level': [query_goslim_term.level]}),
                            ignore_index=True)

    # write the output
    if opts.is_sort:
Example #4
    p.add_option("--disable-draw-parents",
                 action="store_false",
                 dest='draw_parents',
                 help="Do not draw parents of the query term")
    p.add_option("--disable-draw-children",
                 action="store_false",
                 dest='draw_children',
                 help="Do not draw children of the query term")

    p.set_defaults(draw_parents=True)
    p.set_defaults(draw_children=True)

    opts, args = p.parse_args()

    if not len(args):
        obo_file = "go-basic.obo"
    else:
        obo_file = args[0]
        assert os.path.exists(obo_file), "file %s not found!" % obo_file

    g = GODag(obo_file)

    if opts.desc:
        g.write_dag()

    # run a test case
    if opts.term is not None:
        rec = g.query_term(opts.term, verbose=True)
        g.draw_lineage([rec], engine=opts.engine,
                       gml=opts.gml,
                       draw_parents=opts.draw_parents,
                       draw_children=opts.draw_children)
Example #5
    p.add_option(
        "--term",
        help="Write the parents and children of this query term",
    )

    opts, args = p.parse_args()

    if len(args) != 1:
        sys.exit(p.print_help())

    (obo_file, ) = args

    def description(rec):
        level = "level-{:>02}".format(rec.level)
        description = "{} [{}]".format(rec.name, rec.namespace)
        if rec.is_obsolete:
            description += " obsolete"
        alt_ids = ",".join(rec.alt_ids)
        return "\t".join((rec.item_id, level, description, alt_ids))

    g = GODag(obo_file, prt=None)
    header = "\t".join(("#id", "level", "name", "alt_ids"))
    print(header)
    for rec in sorted(set(g.values()), key=lambda x: x.item_id):
        print(description(rec))

    # run a test case
    if opts.term:
        rec = g.query_term(opts.term, verbose=True)
        g.draw_lineage([rec], verbose=True)
Example #6
from goatools.obo_parser import GODag
go_obo_file_path = ''  # path to the GO OBO file
go_term_name = "GO:0015979"
godag = GODag(obo_file=go_obo_file_path)
mygo = godag.query_term(go_term_name)
print(go_term_name + " Parents")
print(mygo.get_all_parents())
print(go_term_name + " Children")
print(mygo.get_all_children())
Example #7
def load(args, dba, logfile, logger):
    gofile = DOWNLOAD_DIR + FILENAME
    if not args['--quiet']:
        print "\nParsing GO OBO file: {}".format(gofile)
    logger.info("Parsing GO OBO file: {}".format(gofile))
    godag = GODag(gofile)

    tct = dba.get_target_count(idg=False)
    if not args['--quiet']:
        print "\nProcessing {} TCRD targets".format(tct)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    ti_ct = 0
    notfnd = {}
    dba_err_ct = 0
    exp_codes = ['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP']
    for t in dba.get_targets(idg=False, include_annotations=True):
        ct += 1
        p = t['components']['protein'][0]
        if 'goas' in p:
            lfe_goa_strs = []
            for d in p['goas']:
                if d['go_term'].startswith('C'):
                    continue  # only want MF/BP terms
                ev = d['evidence']
                if ev not in exp_codes:
                    continue  # only want experimental evidence GOAs
                gt = godag.query_term(d['go_id'])
                if not gt:
                    k = "%s:%s" % (d['go_id'], d['go_term'])
                    notfnd[k] = True
                    logger.error("GO term %s not found in GODag" % k)
                    continue
                if len(gt.children) == 0:  # if it's a leaf node
                    lfe_goa_strs.append("%s|%s|%s" %
                                        (d['go_id'], d['go_term'], ev))
            if lfe_goa_strs:
                rv = dba.ins_tdl_info({
                    'protein_id': p['id'],
                    'itype': 'Experimental MF/BP Leaf Term GOA',
                    'string_value': "; ".join(lfe_goa_strs)
                })
                if not rv:
                    dba_err_ct += 1
                    continue
                ti_ct += 1
        pbar.update(ct)
    pbar.finish()
    print "{} TCRD targets processed.".format(ct)
    print "  Inserted {} new tdl_info rows".format(ti_ct)
    if len(notfnd) > 0:
        print "WARNING: {} GO terms not found in GODag. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Example #8
def goe(
    genelist,
    go_file,
    goa_file,
    bg=None,
    nmin=5,
    conversion=None,
    evidence_set={
        'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'HTP', 'HDA', 'HMP', 'HGI', 'IBA',
        'IBD', 'IKR', 'IRD', 'ISS', 'ISO', 'ISA', 'ISM'
    }):
    """Finds GO enrichment with goatools (0.7.11 tested).

	**WARNING**\ : This method is inexact for multi-maps in gene name conversion. However, it has a negligible effect in top GO component removal in single-cell co-expression.

	Parameters
	------------
	genelist:	list of str
		Genes to search for enrichment.
	go_file:	str
		File path for GO DAG (downloadable at http://geneontology.org/docs/download-ontology/).
	goa_file:	str
		File path for GO associations. See parameter **conversion**.
	bg:			list of str
		Background genes.
	nmin:		int
		Minimum number of principal genes required in GO.
	conversion:	tuple
		Conversion of `gene ID system <https://docs.mygene.info/en/latest/doc/data.html>`_ from gene list to the GO annotation.

		* name_from:	Gene naming system of genelist. For gene names, use 'symbol,alias'.
		* name_to:		Gene naming system of goa_file. Examples:

			* Human: use 'uniprot.Swiss-Prot' (for GO annotations downloaded from http://geneontology.org/gene-associations/goa_human.gaf.gz).
			* Mouse: use 'MGI' (for GO annotations downloaded from http://current.geneontology.org/annotations/mgi.gaf.gz).

		* species:		Species for gene name conversion. Examples: 'human', 'mouse'.

	evidence_set:	set of str
		`GO evidences <http://geneontology.org/docs/guide-go-evidence-codes/>`_ to include. Defaults to non-expression based results to avoid circular reasoning bias.

	Returns
	----------
	goe:		pandas.DataFrame
		GO enrichment.
	gotop:		str
		Top enriched GO ID
	genes:		list of str or None
		Intersection list of genes in gotop and also bg. None if bg is None.

	"""
    from tempfile import NamedTemporaryFile
    from os import linesep
    from goatools.go_enrichment import GOEnrichmentStudy
    from goatools.obo_parser import GODag
    from goatools.associations import read_gaf
    from collections import defaultdict
    import itertools
    from biothings_client import get_client
    import pandas as pd
    import logging
    assert type(genelist) is list and len(genelist) > 0
    if nmin < 1:
        nmin = 1

    bg0 = bg
    # Convert gene names
    if conversion is not None:
        assert len(conversion) == 3
        name_from, name_to, species = conversion
        mg = get_client('gene')
        ans = set(genelist)
        if bg is not None:
            t1 = set(bg)
            assert len(ans - t1) == 0
            ans |= t1
        ans = list(ans)
        ans = mg.querymany(ans,
                           scopes=name_from,
                           fields=name_to,
                           species=species)
        t1 = set(['query', '_score', name_to.split('.')[0]])
        ans = list(filter(lambda x: len(t1 - set(x)) == 0, ans))
        ans = sorted(ans, key=lambda x: x['_score'])
        convert = {x['query']: x for x in ans}
        for xi in name_to.split('.'):
            convert = filter(lambda x: xi in x[1], convert.items())
            convert = {x[0]: x[1][xi] for x in convert}
        convert = {
            x[0]: x[1] if type(x[1]) is str else x[1][0]
            for x in convert.items()
        }
        genelist2 = list(
            set([convert[x]
                 for x in filter(lambda x: x in convert, genelist)]))
        if bg is not None:
            bg = list(
                set([convert[x] for x in filter(lambda x: x in convert, bg)]))
        t1 = set(genelist)
        converti = list(filter(lambda x: x[0] in t1, convert.items()))
        t1 = defaultdict(list)
        for xi in converti:
            t1[xi[1]].append(xi[0])
        converti = dict(t1)
        t1 = defaultdict(list)
        for xi in convert.items():
            t1[xi[1]].append(xi[0])
        convertia = dict(t1)
    else:
        genelist2 = genelist

    # Load GO DAG and association files
    logging.debug('Reading GO DAG file ' + go_file)
    godag = GODag(go_file)
    logging.debug('Reading GO association file ' + goa_file)
    goa = read_gaf(goa_file, evidence_set=evidence_set)
    if bg is None:
        bg = list(goa.keys())

    # Compute enrichment
    goe = GOEnrichmentStudy(bg, goa, godag)
    ans = goe.run_study(genelist2)
    # Format output
    with NamedTemporaryFile() as f:
        goe.wr_tsv(f.name, ans)
        ans = f.read()
    ans = ans.decode()
    ans = [x.split('\t') for x in ans.split(linesep)]
    if len(ans[-1]) < 2:
        ans = ans[:-1]
    if len(ans) == 0 or len(ans[0]) == 0:
        raise ValueError('No enrichment found. Check your input ID type.')
    ans[0][0] = ans[0][0].strip('# ')
    ans = pd.DataFrame(ans[1:], columns=ans[0])
    ans.drop(['NS', 'enrichment', 'study_count', 'p_sidak', 'p_holm'],
             axis=1,
             inplace=True)
    for xj in ['p_uncorrected', 'p_bonferroni']:
        ans[xj] = pd.to_numeric(ans[xj], errors='raise')
    ans['depth'] = pd.to_numeric(ans['depth'],
                                 errors='raise',
                                 downcast='unsigned')
    # Odds ratio column and sort column
    # NOTE: toratio is a helper defined elsewhere in the source module (not shown in this snippet)
    ans['odds_ratio'] = toratio(ans['ratio_in_study']) / toratio(
        ans['ratio_in_pop'])
    ans = ans[[
        'name', 'depth', 'p_uncorrected', 'p_bonferroni', 'odds_ratio',
        'ratio_in_study', 'ratio_in_pop', 'GO', 'study_items'
    ]]
    ans['study_items'] = ans['study_items'].apply(lambda x: x.replace(' ', ''))
    # Convert back study_items
    if conversion is not None:
        ans['study_items'] = ans['study_items'].apply(lambda x: ','.join(
            list(
                itertools.chain.from_iterable(
                    [converti[y] for y in x.split(',')])))
                                                      if len(x) > 0 else x)
    ans.sort_values('p_uncorrected', inplace=True)

    # Get top enriched GO by P-value
    gotop = ans[
        (ans['odds_ratio'] > 1)
        & ans['ratio_in_study'].apply(lambda x: int(x.split('/')[0]) >= nmin)]
    if len(gotop) == 0:
        raise ValueError('No GO enrichment found for given criteria.')
    gotop = str(gotop.iloc[0]['GO'])
    if bg0 is not None:
        # Children GOs
        gos = set([gotop] + list(godag.query_term(gotop).get_all_children()))
        # Look for genes
        genes = list(
            filter(lambda x: len(list(filter(lambda y: y in gos, goa[x]))) > 0,
                   goa))
        if conversion is not None:
            genes = [
                convertia[x] for x in filter(lambda x: x in convertia, genes)
            ]
            genes = list(set(list(itertools.chain.from_iterable(genes))))
        genes = set(genes)
        genes = list(filter(lambda x: x in genes, bg0))
    else:
        genes = None
    return (ans, gotop, genes)
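
A hedged usage sketch for goe; the file names, gene symbols, and conversion tuple below are placeholders chosen to match the docstring, not values from the source:

# Sketch only: assumes go-basic.obo and mgi.gaf were downloaded beforehand
go_table, top_go, top_genes = goe(
    ['Actb', 'Gapdh', 'Trp53'],        # hypothetical mouse gene symbols
    'go-basic.obo',                    # go_file: GO DAG
    'mgi.gaf',                         # goa_file: GO associations (GAF)
    conversion=('symbol,alias', 'MGI', 'mouse'))
print(top_go)
print(go_table.head())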
Example #9
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'GO Experimental Leaf Term Flags',
        'source':
        'IDG-KMC generated data by Steve Mathias at UNM.',
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'comments':
        'These values are calculated by the loader app and indicate that a protein is annotated with a GO leaf term in either the Molecular Function or Biological Process branch with an experimental evidence code.'
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset See logfile {} for details.".format(
            logfile)
        sys.exit(1)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id':
        dataset_id,
        'table_name':
        'tdl_info',
        'where_clause':
        "itype = 'Experimental MF/BP Leaf Term GOA'"
    })
    if not rv:
        print "WARNING: Error inserting provenance. See logfile {} for details.".format(
            logfile)
        sys.exit(1)

    gofile = DOWNLOAD_DIR + FILENAME
    logger.info("Parsing GO OBO file: %s" % gofile)
    godag = GODag(gofile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    tct = dba.get_target_count(idg=False)
    if not args['--quiet']:
        print "\nProcessing {} TCRD targets".format(tct)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    ti_ct = 0
    notfnd = {}
    dba_err_ct = 0
    exp_codes = ['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP']
    for t in dba.get_targets(idg=False, include_annotations=True):
        ct += 1
        p = t['components']['protein'][0]
        if 'goas' in p:
            lfe_goa_strs = []
            for d in p['goas']:
                if d['go_term'].startswith('C'):
                    continue  # only want MF/BP terms
                ev = d['evidence']
                if ev not in exp_codes:
                    continue  # only want experimental evidence GOAs
                gt = godag.query_term(d['go_id'])
                if not gt:
                    k = "%s:%s" % (d['go_id'], d['go_term'])
                    notfnd[k] = True
                    logger.error("GO term %s not found in GODag" % k)
                    continue
                if len(gt.children) == 0:  # if it's a leaf node
                    lfe_goa_strs.append("%s|%s|%s" %
                                        (d['go_id'], d['go_term'], ev))
            if lfe_goa_strs:
                rv = dba.ins_tdl_info({
                    'protein_id': p['id'],
                    'itype': 'Experimental MF/BP Leaf Term GOA',
                    'string_value': "; ".join(lfe_goa_strs)
                })
                if not rv:
                    dba_err_ct += 1
                    continue
                ti_ct += 1
        pbar.update(ct)
    pbar.finish()
    print "{} TCRD targets processed.".format(ct)
    print "  Inserted {} new  tdl_info rows".format(ti_ct)
    if len(notfnd) > 0:
        print "WARNING: {} GO terms not found in GODag. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Example #10
def get_go_protease():
    from goatools.obo_parser import GODag
    g = GODag('/home/gstupp/go/go-basic.obo')
    go_term = g.query_term('GO:0008233')
    go_protease = go_term.get_all_children()
    return go_protease
Example #11
ana_DTA = '/home/gstupp/01_2015_mass_spec/H1_11082014/1108_Gly1_2014_12_15_15_29205/dtaselect_results_sfp0.01_p2/DTASelect-filter.txt'
parser = blazmass_tools.dta_select_parser(ana_DTA, small=True)
ps = [get_domains(p) for p in parser]
set_go = set(chain(*[p['set_go'] for p in ps if p['set_go'] is not None]))
for p in ps:
    if p['set_go']:
        p['go_slim'] = set(
            chain(*[
                mapslim(go_term, go_dag, goslim_meta)[0]
                for go_term in p['set_go'] if go_term in go_dag
            ]))
    else:
        p['go_slim'] = None
go_slim = Counter(chain(*[p['go_slim'] for p in ps if p['go_slim']]))
labels = {
    go_term: go_dag.query_term(go_term).name
    for go_term in go_slim.keys()
}
[labels[go] for (go, x) in go_slim.most_common(n=10)]

import plot_tools

cmap = plt.cm.jet
colors = cmap(np.linspace(0., 1., len(go_slim.keys())))
explode = [1 if x < 20 else 0 for x in go_slim.values()]

fig = plt.figure(figsize=(8, 8))
ax = plt.subplot(111)
patches = ax.pie(list(go_slim.values()),
                 autopct='%1.1f',
                 explode=explode,
Example #12
class GO2Mongo(object):
    """

    """
    def __init__(self,
                 obo_file="/data/databases/go/go.obo",
                 db="xomeq",
                 client=None,
                 go="ontologies",
                 go_index="col_ont_idx",
                 ontology_name="go",
                 slim_file=None):
        """"""
        self.obo_file = obo_file
        self.slim_file = slim_file
        self.graph = nx.DiGraph()
        self.graph_file = '/data/databases/' + ontology_name + '/' + ontology_name + '.gpickle'
        self.go_dag = None

        if isinstance(db, basestring):
            if not client:
                client = MongoClient()
            self.db = client[db]
        else:
            assert isinstance(db, Database)
            self.db = db

        self.col_go = self.db[go]
        self.col_go_index = self.db[go_index]
        self.ontology_name = ontology_name
        if self.ontology_name == "go":
            self.root_terms = GO_ROOT_TERMS
        else:
            self.root_terms = SO_ROOT_TERMS
        self.ki = KeywordIndexer()

    def init(self):
        _log.debug("Cargando archivo de ontologias:" + self.obo_file)
        self.go_dag = GODag(self.obo_file)
        _log.debug("Se cargo el archivo:" + self.obo_file)

        if os.path.exists(self.graph_file):
            self.graph = nx.read_gpickle(self.graph_file)
        else:
            self._build_graph()
            nx.write_gpickle(self.graph, self.graph_file)

        _log.debug("Se genero el grafo de terminos")

    def add_unknow(self):
        """
        pepe = {
            "_id" : ObjectId("591f14deaab82b7f88ef8c04"),
            "term" : "go:9999999",
            "name" : "Uknown",
            "ontology" : "go",
            "databases" : [ ],
            "description" : "Uknown",
            "keywords" : [ ],
            "parents" : [
                "go:0005575",
                "go:0003674",
                "go:0008150"
            ],
            "children" : [ ],
            "successors" : [ ],
            "subclases" : [ ],
            "successors_relationships" : [ ],
            "database" : "biological_process"
        }
        
        
        db.getCollection('ontologies').save(pepe)
        
        db.proteins.update({organism:"LactoUV",ontologies:{$ne:"go:0008150"}},{$addToSet:{ontologies:"go:9999999"}},{multi:true})
        db.ontologies.update({ontology:"go",term:"go:0005575"},{$addToSet:{"subclases":"go:9999999"}})
        db.ontologies.update({ontology:"go",term:"go:0005575"},{$addToSet:{"successors":"go:9999999"}})
        db.ontologies.update({ontology:"go",term:"go:0005575"},{$addToSet:{"children":"go:9999999"}})

        db.ontologies.update({ontology:"go",term:"go:0005575"},{$pull:{"subclases":"go:9999999"}})
        db.ontologies.update({ontology:"go",term:"go:0005575"},{$pull:{"successors":"go:9999999"}})
        db.ontologies.update({ontology:"go",term:"go:0005575"},{$pull:{"children":"go:9999999"}})

        
        {
    "_id" : ObjectId(""),
    "_cls" : "SeqColOntologyIndex",
    "term" : "go:9999999",
    "name" : "unknown",
    "count" : 800,
    "order" : 27,
    "keywords" : [   ],
    "ontology" : "go",
    "database" : "biological_process",
    "seq_collection_name" : "LactoUV",
    "seq_collection_id" : ObjectId("591caafebe737e774090b78d")
}
        
        
    """

    def load(self):
        self.init()
        self._load_mongo()
        _log.info("Obo %s loaded in %s collection %s and index in %s" %
                  (self.obo_file, self.db.name, self.col_go.name,
                   self.col_go_index.name))

        _log.debug("Loading generic slim")
        self.load_slim()
        _log.info("Generic slim terms loaded")

    def load_slim(self,
                  slim_file="/data/databases/go/goslim_generic.obo",
                  database="generic"):
        parser = GODag(slim_file)
        for ont in parser:
            try:
                go = Ontology.objects(ontology="go", term=ont.lower()).get()
                go.databases.append(database)
                go.save()
            except Exception as ex:
                _log.error(ex)
        go = Ontology.objects(ontology="go", term="root").get()
        go.databases.append(database)
        go.save()

    def _load_mongo(self):
        root = Ontology(ontology=self.ontology_name,
                        term="root",
                        successors=self.root_terms,
                        children=self.root_terms)
        root.save()
        for (node, data) in self.graph.nodes_iter(
                data=True):  # self.graph.add_node(node, **data)
            if node == "root":
                raise Exception("...")
            else:
                successors = self.graph.successors(node)
                _ancestors = self.complete_subgraph([node])

                database = "biological_process"
                if "go:0005575" in _ancestors:
                    database = "cellular_component"
                if "go:0003674" in _ancestors:
                    database = "molecular_function"

                ont_doc = Ontology(
                    ontology=self.ontology_name,
                    term=node,
                    name=data["name"],
                    database=database,
                    successors=self.all_successors(node, []),
                    children=successors,
                    description=self.go_dag.query_term(node.upper()).desc,
                    # successors_relationships=self.successors_relationships(node),
                    subclases=list(
                        set([
                            x.lower() for x in self.go_dag.query_term(
                                node.upper()).get_all_children()
                        ])))
                ont_doc.keywords = self.ki.extract_keywords(
                    [ont_doc.description, ont_doc.name, ont_doc.term])
                ont_doc.save()

    def _build_graph(self):
        assert self.go_dag, "GO terms were not loaded"

        self.graph.add_node("root", name="root")

        processed = []
        for root_term in self.root_terms:  # Iterates over each root
            root = self.go_dag.query_term(root_term)

            self.graph.add_node(root_term.lower(), name=root.name)
            self.graph.add_edge("root", root_term.lower())
            self._load_branch(root, processed)

    def _load_branch(self, term, processed):

        term_id = term.id.lower()
        if term_id in processed: return
        processed.append(term_id)
        if term.children:  # or term.relationships:
            for child in term.children:  # + [x[1] for x in term.relationships]):
                child_id = child.id.lower()

                self.graph.add_node(child_id, name=child.name)
                self.graph.add_edge(term_id, child_id)
                self._load_branch(child, processed)

    def all_successors(self, node, processed):
        successors = self.graph.successors(node)
        if successors:
            for x in successors:
                if x not in processed:
                    processed.append(x)
                    successors = list(
                        set(successors + self.all_successors(x, processed)))
        else:
            successors = list()
        return successors

    def successors_relationships(self, node):
        term = self.go_dag.query_term(node.upper())
        return [[x.id.lower(), "is_a"]
                for x in term.children] + [[x[1].id.lower(), x[0]]
                                           for x in term.relationships]

    def cleanup_cellular_component_annotations(self, genome):
        for ont_doc in Ontology.objects(ontology=self.ontology_name,
                                        database="cellular_component",
                                        databases__ne="generic"):
            # self.db["proteins"].update({"organism":genome, }, {"$pull":{"ontologies":ont_doc.term, "keywords":ont_doc.term}}, multi=True)
            self.db["col_ont_idx"].remove(
                {
                    "ontology": "go",
                    "seq_collection_name": genome.name,
                    "term": ont_doc.term
                },
                multi=True)

    def complete_subgraph(self, ontologies):
        allontologies = copy.copy(ontologies)

        for ontology in ontologies:
            allontologies += self._complete_parents(ontology, [])

        return [x for x in set(allontologies) if x != "root"]

    def _complete_parents(self, ontology, walked):
        allontologies = [ontology]
        walked.append(ontology)
        if ontology in self.graph:
            for ancestor in ancestors(self.graph, ontology):
                if ancestor not in walked:
                    allontologies += self._complete_parents(ancestor, walked)
            return allontologies
        else:
            return allontologies

    #
    def pre_build_index(self,
                        genome,
                        annotated_collection="proteins",
                        annotated_collection_field="ontologies",
                        drop=True):
        if drop:
            print(
                self.col_go_index.remove({
                    "seq_collection_id": genome.id,
                    "ontology": self.ontology_name
                }))

        ont_succ_cache = {}
        for ont_doc in tqdm(
                Ontology.objects(ontology=self.ontology_name).no_cache(),
                total=Ontology.objects(ontology=self.ontology_name).count()):
            ont_succ_cache[ont_doc.term] = ont_doc.successors
            database = ""

            if hasattr(ont_doc, "database") and ont_doc.database:
                database = ont_doc.database
            #             if hasattr(ont_doc, "databases") and ont_doc.databases:
            #                 database = ont_doc.databases[0]
            order = len(ont_doc["children"])

            seq_ont_ont_idx = SeqColOntologyIndex(
                term=ont_doc.term.lower(),
                name=ont_doc.name,
                count=0,
                seq_collection_name=genome.name,
                database=database,
                ontology=self.ontology_name,
                order=order,
                seq_collection_id=genome.id,
                keywords=ont_doc.keywords)
            seq_ont_ont_idx.save()

        ont_count = defaultdict(lambda: 0)
        query = {
            "seq_collection_id": genome.id,
            "ontologies.0": {
                "$exists": True
            }
        }
        for p in tqdm(self.db[annotated_collection].find(
                query, {"ontologies": 1}),
                      total=self.db[annotated_collection].count(query)):
            terms = [x for x in p["ontologies"] if x.startswith("go:")]
            terms = self.complete_subgraph(terms)
            for x in terms:
                ont_count[x] += 1
            self.db[annotated_collection].update(
                {"_id": p["_id"]},
                {"$addToSet": {
                    annotated_collection_field: {
                        "$each": terms
                    }
                }})

        for term, count in tqdm(ont_count.items()):
            for seq_ont_ont_idx in SeqColOntologyIndex.objects(
                    seq_collection_id=genome.id,
                    ontology=self.ontology_name,
                    term=term):
                seq_ont_ont_idx.count = count
                seq_ont_ont_idx.save()

        SeqColOntologyIndex.objects(seq_collection_id=genome.id,
                                    count=0).delete()

        self.cleanup_cellular_component_annotations(genome)
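
A minimal usage sketch for GO2Mongo; the OBO path and database name mirror the constructor defaults, and it assumes a reachable MongoDB instance plus the project's Ontology and SeqColOntologyIndex models:

# Sketch only: arguments shown are the constructor defaults, not values from the source
go2mongo = GO2Mongo(obo_file="/data/databases/go/go.obo",
                    db="xomeq",
                    ontology_name="go")
go2mongo.load()  # parses the OBO file, builds the term graph and populates the Mongo collections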