Beispiel #1
0
    def to_html(self, title=None):
        """Render this object as HTML through the 'test_html.tmpl' template.

        The template is looked up under '<kobas_home>/template/', where
        'kobas_home' comes from the KOBAS configuration.  The `title`
        argument is accepted but never read here; the table header handed to
        the template is `self.title`.

        Returns:
            The rendered template as a string.
        """
        from kobas import config
        from Cheetah.Template import Template

        kobas_home = config.getrc()["kobas_home"]
        template_path = os.path.join(kobas_home, "template", "test_html.tmpl")
        # Must run before `self.result` is read for the template namespace.
        self.add_link_to_pathway()
        namespace = {"thead": self.title, "result": self.result}
        page = Template(file=template_path, searchList=[namespace])
        return str(page)
Beispiel #2
0
 def __init__(self, dbfile="", *args, **kwargs):
     """Open the KEGG database and keep a cursor on it.

     When `dbfile` is empty, the configuration is stored in `self._rc` and
     its 'keggdb' entry is used as the database path.  Any extra positional
     or keyword arguments are forwarded to `sqlite.connect`.
     """
     if not dbfile:
         self._rc = config.getrc()
         dbfile = self._rc['keggdb']
     self.db = sqlite.connect(dbfile, *args, **kwargs)
     self.cursor = self.db.cursor()
Beispiel #3
0
 def __init__(self, dbfile="", *args, **kwargs):
     """Connect to the KEGG database and create a cursor.

     An explicit `dbfile` wins; otherwise the path is read from the 'keggdb'
     entry of the configuration, which is also kept in `self._rc`.  Extra
     arguments are passed straight through to `sqlite.connect`.
     """
     db_path = dbfile
     if not db_path:
         # No explicit file given: fall back to the configured location.
         self._rc = config.getrc()
         db_path = self._rc["keggdb"]
     connection = sqlite.connect(db_path, *args, **kwargs)
     self.db = connection
     self.cursor = connection.cursor()
Beispiel #4
0
 def to_html(self, title=None):
     """Return the HTML produced by the 'test_html.tmpl' Cheetah template.

     `self.add_link_to_pathway()` is called first, then `self.title` and
     `self.result` are exposed to the template as 'thead' and 'result'.
     The `title` argument is not used by this method.
     """
     from kobas import config
     from Cheetah.Template import Template
     tmpl = os.path.join(
         config.getrc()['kobas_home'], "template", "test_html.tmpl")
     self.add_link_to_pathway()
     search_list = [{'thead': self.title, 'result': self.result}]
     return str(Template(file=tmpl, searchList=search_list))
Beispiel #5
0
    def requires(self):
        """Publish this task's settings as module globals and list its prerequisites.

        Side effects: sets the module-level reference, directory and species
        variables declared below, creates the log directory through
        `circ_mkdir_unix`, and prints the BLAST database directory.

        Returns:
            A list with one instance each of fa_index, star_index,
            go_annotation and ko_annotation.
        """
        global ref_fa, ref_gtf, genome_dir, annotation_dir, log_dir
        global species_latin, species_ensembl, species_kegg
        global ko_pep_dir, ko_db_dir

        ref_fa, ref_gtf, species_latin = (
            self.ref_fa, self.ref_gtf, self.species_latin)
        species_kegg, species_ensembl = get_kegg_biomart_id(species_latin)

        # Directories are derived from where the reference files live.
        genome_dir = path.dirname(ref_fa)
        annotation_dir = path.dirname(ref_gtf)
        log_dir = path.join(annotation_dir, 'logs')

        kobasrc = config.getrc()
        ko_pep_dir = kobasrc['blastdb']
        ko_db_dir = kobasrc['kobasdb']

        circ_mkdir_unix(log_dir)
        print(ko_pep_dir)

        prerequisites = [fa_index(), star_index(), go_annotation(), ko_annotation()]
        return prerequisites
Beispiel #6
0
def arg2dist(arg):
    """Build the background distribution selected by `arg`.

    A three-character `arg` is treated as a genome abbreviation: the
    distribution is read from '<kobas_home>/gene_dist/<species name>', where
    the species name comes from `keggdb.get_species_name(arg.lower())`.  Any
    other value is treated as the path of a user-defined annotation file.

    Returns:
        A `(dist_dict, dist_size)` tuple, where `dist_size` is
        `dist_dict.size()`.

    Exits the process with status 1 (after printing a message) when the
    annotation file does not exist.
    """
    kobasrc = config.getrc()
    if len(arg) == 3:
        keggdb = dbutils.keggdb()
        distfile = os.path.join(kobasrc["kobas_home"], "gene_dist",
                                keggdb.get_species_name(arg.lower()))
        # NOTE(review): closing here assumes the parser reads the whole file
        # before returning -- confirm against discover.dist_from_distfile.
        with open(distfile) as handle:
            dist_dict = discover.dist_from_distfile(handle)
    else:
        if not os.access(arg, os.F_OK):
            print("File: %s not exist" % arg)
            # A missing input is a failure; do not report success to the shell.
            sys.exit(1)
        with open(arg, 'r') as handle:
            dist_dict = discover.dist_from_annot_file(handle)
    dist_size = dist_dict.size()
    return (dist_dict, dist_size)
Beispiel #7
0
def distr_from_default(default, db, speciesdb, abbr, small_num, idmapping):
    """Count how many background genes are annotated with each term of `db`.

    Args:
        default: name used to pick gene ids in the 'ko' branch and to build
            the '<kobasdb><default>.db' file opened for id mapping
            (presumably a species abbreviation -- TODO confirm).
        db: database code; it is looked up in dbutils.P, dbutils.D and
            dbutils.G to decide which kind of terms to fetch.
        speciesdb: database object queried for genes, KOs and terms.
        abbr: 'ko' selects KO-based counting; any other value counts per gene.
        small_num: threshold handed to `remove_small_terms2`.
        idmapping: id type; when truthy, every gene contributes the number of
            its linked ids of that type instead of 1.

    Returns:
        dict mapping `(term name, db, term id)` to its count, after
        `remove_small_terms2(distr, small_num)` has been applied.
    """
    distr = {}

    def _tally(term, id_field, weight):
        # One counter per (term name, database, term id) triple.
        key = (term['name'], db, term[id_field])
        distr[key] = distr.get(key, 0) + weight

    if abbr == 'ko':
        # Opened on first use and reused afterwards; the original opened a
        # fresh connection to the same file for every single gene.
        bspeciesdb = None
        for koid in speciesdb.kos():
            terms = speciesdb.pathways_from_koid(koid['koid'])
            gids = speciesdb.gids_from_koid(koid['koid'], default)
            for gid in gids:
                weight = 1
                if idmapping:
                    if bspeciesdb is None:
                        bspeciesdb = dbutils.KOBASDB(
                            config.getrc()['kobasdb'] + default + '.db')
                    dblink_ids = [
                        dblink_id[0] for dblink_id in
                        bspeciesdb.dblink_ids_from_gid(gid['gid'], idmapping)]
                    weight = len(dblink_ids)
                # `terms` is deliberately not copied: it is iterated once per
                # gene exactly as before.
                for term in terms:
                    _tally(term, 'pid', weight)

    else:
        # The category of `db` does not change while looping over genes.
        is_pathway = db in dbutils.P
        is_disease = db in dbutils.D
        is_go = db in dbutils.G

        # Pathway and disease rows are keyed by 'id', GO rows by 'goid'.
        if is_pathway or is_disease:
            id_field = 'id'
        elif is_go:
            id_field = 'goid'
        else:
            id_field = None

        for gid in speciesdb.genes():
            if is_pathway:
                terms = speciesdb.pathways_from_gid(gid['gid'], db)
            elif is_disease:
                terms = speciesdb.diseases_from_gid(gid['gid'], db)
            elif is_go:
                terms = speciesdb.gos_from_gid(gid['gid'])

            weight = 1
            if idmapping:
                dblink_ids = [
                    dblink_id[0] for dblink_id in
                    speciesdb.dblink_ids_from_gid(gid['gid'], idmapping)]
                weight = len(dblink_ids)

            if id_field is not None:
                for term in terms:
                    _tally(term, id_field, weight)

    remove_small_terms2(distr, small_num)

    return distr
Beispiel #8
0
def arg2dist(arg):
    """Return the background distribution named by `arg` together with its size.

    `arg` is either a three-character genome abbreviation, resolved to the
    file '<kobas_home>/gene_dist/<species name>', or the path of a
    user-defined annotation file.

    Returns:
        `(dist_dict, dist_size)`, with `dist_size` taken from
        `dist_dict.size()`.

    Prints a message and exits with status 1 when the annotation file is
    missing.
    """
    kobasrc = config.getrc()
    if len(arg) == 3:
        keggdb = dbutils.keggdb()
        distfile = os.path.join(
            kobasrc["kobas_home"], "gene_dist",
            keggdb.get_species_name(arg.lower()))
        # NOTE(review): assumes dist_from_distfile consumes the handle before
        # returning -- confirm; the handle is closed right afterwards.
        with open(distfile) as dist_handle:
            dist_dict = discover.dist_from_distfile(dist_handle)
    else:
        if not os.access(arg, os.F_OK):
            print("File: %s not exist" % arg)
            # Signal the failure to the caller instead of exiting with 0.
            sys.exit(1)
        with open(arg, 'r') as annot_handle:
            dist_dict = discover.dist_from_annot_file(annot_handle)
    return (dist_dict, dist_dict.size())
Beispiel #9
0
import kobas.config as kobas_config
import os

# FTP locations of the KOBAS 3.0 downloads; '{}' is a placeholder to be filled
# in by the caller (presumably with a species abbreviation -- TODO confirm).
PEP_URL_TEMP = 'ftp://ftp.cbi.pku.edu.cn/pub/KOBAS_3.0_DOWNLOAD/seq_pep/{}.pep.fasta.gz'
DB_URL_TEMP = 'ftp://ftp.cbi.pku.edu.cn/pub/KOBAS_3.0_DOWNLOAD/sqlite3/{}.db.gz'
# Default BLAST options: e-value cutoff 1e-5, tabular output (format 6), at
# most one target sequence per query.
BLAST_DEFAULT = '-evalue 1e-5 -outfmt 6 -max_target_seqs 1'

# The KOBAS configuration is read once, when this module is imported; the
# names below are plain lookups of its entries.
KOBASRC = kobas_config.getrc()
BLAST_DIR = KOBASRC['blastout']
PEP_DIR = KOBASRC['blastdb']
DB_DIR = KOBASRC['kobasdb']
KOBAS_RUN = KOBASRC['kobas_run']
KOBAS_PY = KOBASRC['kobas_py']
# Both files live in the 'blastout' directory, not in the 'blastdb' one.
WHEAT_PEP_DB = os.path.join(BLAST_DIR, 'wheat.pep.fa')
TERM_CAT = os.path.join(BLAST_DIR, 'term.cat.txt')
Beispiel #10
0
def _last_dblink_id(speciesdb, gid, idtype):
    """Return the last `idtype` identifier linked to `gid`, or None if there is none."""
    dblink_id = None
    for each in speciesdb.dblink_ids_from_gid(gid, idtype):
        dblink_id = each[0]
    return dblink_id


def _collect_hits(speciesdb, gset_gids, idtype, gene_list, hit_matrix, row):
    """Mark in `hit_matrix[row]` the input genes that belong to one gene set.

    Returns the matched identifiers in the order they were found.
    """
    hgene = []
    for gid in gset_gids:
        # Reset per gene so an id from the previous gene is never reused.
        dblink_id = _last_dblink_id(speciesdb, gid[0], idtype)
        if dblink_id in gene_list:
            hgene.append(dblink_id)
            hit_matrix[row, gene_list.index(dblink_id)] = 1
    return hgene


def read_gmt_db(gene_list, species, dbtype, idtype, min_size, max_size):
    """Load gene sets from a KOBAS species database and match them to `gene_list`.

    Args:
        gene_list: list of gene identifiers; positions in this list are the
            columns of the hit matrix.
        species: abbreviation used to open '<kobasdb><species>.db'; disease
            gene sets are only accepted for 'hsa'.
        dbtype: '<category>' or '<category>:<db>/<db>/...', where category is
            'P' (pathway), 'D' (disease) or 'G' (GO).
        idtype: id type passed to `speciesdb.dblink_ids_from_gid`.
        min_size, max_size: gene sets with fewer than `min_size` or more than
            `max_size` matched genes are filtered out.

    Raises:
        exception.GmtdatabaseErr: unknown category, no usable database, no
            gene set found, or no input gene matched any gene set.
        exception.GmtvalueErr: every gene set was removed by the size filter.

    A ValueError raised anywhere in the body terminates the process through
    `sys.exit`.

    NOTE(review): the visible code computes the filtered arrays but has no
    return statement -- the tail of the function may be missing; confirm.
    """
    gset_name = []  # identifier of every gene set
    gset_des = []  # description (name) of every gene set
    hit_genes = []  # per gene set: the genes present in both list and set
    # GAD and N are not supported because they lack a disease id.
    D = {'o': 'OMIM', 'k': 'KEGG DISEASE', 'f': 'FunDO'}

    no_gset_msg = "Error Message: \nFail to get information from gene set database."
    no_match_msg = "Error Message:\nNone of genes in input file(gct file) was matched with genes in gene sets. \nPlease check the gct file(-e), the species(-s), the idtype(-i), the database type(-d)."

    try:
        kobasrc = config.getrc()
        speciesdb = dbutils.KOBASDB(kobasrc['kobasdb'] + species + '.db')

        dbcatg = dbtype.split(':')[0]
        if dbcatg not in ['P', 'D', 'G']:
            raise exception.GmtdatabaseErr(
                "Error Message: This database category you input is not allowed. Please choose 'P','D','G' for database category.")

        records = []
        gset_num = 0

        if dbcatg == 'P':
            input_dbs = set(dbtype.split(':')[1].split('/'))
            avail_dbs = {}
            for database in speciesdb.pathwaydbs_from_abbr(species):
                avail_dbs[database[0]] = database[1]
            dbs = input_dbs.intersection(set(avail_dbs.keys()))
            if dbs:
                print("Databases: %s" % ', '.join(
                    [avail_dbs[db] for db in dbs]))
            else:
                raise exception.GmtdatabaseErr(
                    "No supported databases are selected. Supported databases are %s, but your input databases are: %s." %
                    ('/'.join(avail_dbs.keys()), dbtype))

            for db in dbs:
                tmp = speciesdb.allpathways(db).fetchall()
                num = speciesdb.pathwaynums(db)
                records += tmp
                gset_num += num
            if gset_num == 0:
                raise exception.GmtdatabaseErr(no_gset_msg)

            # Pathway records: column 0 feeds the gene lookup, 1 is the id,
            # 2 the name.
            name_col, des_col = 1, 2
            genes_of = speciesdb.genes_from_pid
            skip_unnamed = False

        elif dbcatg == 'D':
            input_dbs = set(dbtype.split(':')[1].split('/'))
            if species != 'hsa':
                raise exception.GmtdatabaseErr(
                    "Error Message: Disease is only supported for Homo Sapiens(-s hsa), not supported for this species. Please choose another database category, e.g. 'P','G'")
            dbs = input_dbs.intersection(set(D.keys()))

            if not dbs:
                # Report the databases that are really supported; the list
                # used to be built from a dict that was always empty.
                raise exception.GmtdatabaseErr(
                    'No supported databases are selected. Supported databases are %s, but your input databases are: %s.' %
                    ('/'.join(sorted(D.keys())), dbtype))

            for db in dbs:
                tmp = speciesdb.alldiseases(db).fetchall()
                num = speciesdb.diseasenums(db)
                records += tmp
                gset_num += num
            if gset_num == 0:
                raise exception.GmtdatabaseErr(no_gset_msg)

            name_col, des_col = 1, 2
            genes_of = speciesdb.genes_from_did
            # Disease records without an id are left out.
            # NOTE(review): skipped records still count in `gset_num`, so the
            # name/description arrays can end up shorter than the hit matrix
            # that drives the filtering below -- confirm this is harmless.
            skip_unnamed = True

        else:
            records = speciesdb.allgoterms().fetchall()
            gset_num = speciesdb.gotermnums()
            # GO records: column 0 is the id, column 1 the name.
            name_col, des_col = 0, 1
            genes_of = speciesdb.genes_from_goid
            skip_unnamed = False

        # One row per gene set, one column per input gene; 1 marks a hit.
        hit_matrix = np.zeros((gset_num, len(gene_list)))
        i = 0
        for record in records:
            if skip_unnamed and not record[name_col]:
                continue
            gset_name.append(record[name_col])
            gset_des.append(record[des_col])
            hit_genes.append(
                _collect_hits(speciesdb, genes_of(record[0]), idtype,
                              gene_list, hit_matrix, i))
            i += 1
        if 1 not in hit_matrix:
            raise exception.GmtdatabaseErr(no_match_msg)

        gset_name = np.array(tuple(gset_name))
        gset_des = np.array(gset_des)
        hit_genes = np.array(hit_genes)
        hitsum = hit_matrix.sum(1)
        # Rows whose hit count falls outside [min_size, max_size] are dropped.
        delindex = np.where((hitsum < min_size) | (hitsum > max_size))[0]
        hit_matrix_filtered = np.delete(hit_matrix, delindex, axis=0)
        hit_genes_filterd = np.delete(hit_genes, delindex, axis=0)
        hit_sum_filtered = np.delete(hitsum, delindex, axis=0)
        gset_name_filtered = np.delete(np.array(gset_name), delindex, axis=0)
        gset_des_filtered = np.delete(gset_des, delindex, axis=0)
        if len(hit_matrix_filtered) == 0 and len(
                gset_name_filtered) == 0 and len(
                    gset_des_filtered) == 0 and len(hit_genes_filterd) == 0:
            raise exception.GmtvalueErr(
                "Error Message:\nAll gene sets" + repr(len(hit_matrix)) +
                "have been filtered. \nPlease check the threshold and ceil of gene set size (values of min_size and max_size). ")
    except ValueError as e:
        sys.exit(e)
Beispiel #11
0
    for abbr in pathways.keys():
        speciesdb = dbutils.KOBASDB(kobasrc['kobasdb'] + abbr + '.db')

        count = 0
        for pname in pathways[abbr].keys():
            count += 1
            pathways[abbr][pname][0] = count * 10 + 4

            if __name__ == '__main__':
                print pathways[abbr][pname][0], 'R', pathways[abbr][pname][
                    1], pname
            else:
                speciesdb.con.execute('INSERT INTO Pathways VALUES (?, ?, ?, ?)', \
                    (pathways[abbr][pname][0], 'R', pathways[abbr][pname][1], pname))

        for gene_pathway in gene_pathways[abbr]:
            if __name__ == '__main__':
                print gene_pathway[0], pathways[abbr][gene_pathway[1]][0]
            else:
                speciesdb.con.execute('INSERT INTO GenePathways VALUES (?, ?)', \
                    (gene_pathway[0], pathways[abbr][gene_pathway[1]][0]))


if __name__ == '__main__':
    import sys

    from kobas import config

    # Script mode: the two files named on the command line are handed to
    # parse() together with the KOBAS configuration, then closed again.
    with open(sys.argv[1]) as first_handle:
        with open(sys.argv[2]) as second_handle:
            parse(first_handle, second_handle, config.getrc())
Beispiel #12
0
    for pathway in tuple(pathways):
        if pathway[0] not in pids:
            pathways.remove(pathway)

    pathways = tuple(pathways)
    gene_pathways = tuple(gene_pathways)

    return pathways, gene_pathways


if __name__ == '__main__':
    import sys
    from pprint import pprint

    from kobas import config, dbutils

    # Script mode: parse the file named on the command line against the
    # 'hsa' species database and print a short summary of the result.
    with open(sys.argv[1]) as handle:
        pathways, gene_pathways = parse(
            handle,
            dbutils.KOBASDB(config.getrc()['kobasdb'] + 'hsa.db'))

    print("%s %s" % (len(pathways), len(gene_pathways)))

    # Distinct values of the first and of the second element of each pair.
    g = set(gp[0] for gp in gene_pathways)
    p = set(gp[1] for gp in gene_pathways)

    print("%s %s" % (len(g), len(p)))

    pprint(pathways[:5])
    pprint(gene_pathways[:5])
Beispiel #13
0
        help="specify which program to use by blastall, default blastp")
    p.add_option(
        "-r", "--rank", dest="rank", action="store", type="string", 
        help="rank cutoff for valid hit from BLAST, default 5")
    (opt,args) = p.parse_args()
    return (p,opt,args)
    
if __name__ == "__main__":
    opt_parser,opt,args = config_option()

    if len(args) != 1:
        opt_parser.print_help()
        sys.exit(1)

    # Blast environment configuration
    kobasrc = config.getrc()

    # Integrate command line switches into environments
    if opt.rank:
        kobasrc["rank"] = opt.rank
    if opt.evalue:
        kobasrc["evalue"] = opt.evalue

    keggdb = dbutils.keggdb(kobasrc['kobasdb'])
    
    if opt.intype == "fasta":
        # verify fasta file
        try:
            try:
                f = open(args[0])
                fasta.verify(f)
Beispiel #14
0
#!/usr/bin/env python
# -*- coding: ISO-8859-1 -*-
# Copyright by Mao Xizeng ([email protected])
# Created: 2005-03-24 13:14:23
# $Id: kpath.py 465 2008-03-04 18:15:07Z lymxz $

"""KEGG pathway mining based on KO
"""

# The revision number is the second-to-last whitespace-separated token of the
# expanded SVN keyword string.
__version__ = '$LastChangedRevision: 465 $'.split()[-2]

from glob import glob
from os.path import join

from kobas.config import getrc
from kobas.kgml import kgml
from pygraphlib import pygraph, algo, pydot

# Shared KGML accessor, created when the module is imported; it points at
# '<dat_dir>/data/kgml', with 'dat_dir' read from the KOBAS configuration.
kg = kgml(join(getrc()['dat_dir'], 'data', 'kgml'))

def dgraph_from_kgml():
    """Return `pygraph.from_list` of the pathway-to-linked-pathway pairs.

    Every pair is `(pathway key, linked pathway name)`, taken from the
    pathways that `kg.get_pathways('ot')` returns.
    """
    links = [
        (source, target.name)
        for source, pathway in kg.get_pathways('ot').items()
        for target in pathway.get_linked_pathways()
    ]
    return pygraph.from_list(links)
    
if __name__ == "__main__":
    # Running the module as a script does nothing beyond the import-time
    # setup performed above.
    pass
Beispiel #15
0
                    gids = bcids[gene_id]
                    for gid in gids:
                        gene_pathways.add((gid, pid))

    pathways = list(pathways)
    gene_pathways = list(gene_pathways)

    return pathways, gene_pathways


if __name__ == '__main__':
    import sys
    from pprint import pprint

    from kobas import config, dbutils

    # Script mode: argv[1] and argv[2] are the input files, argv[3] names the
    # species database '<kobasdb><argv[3]>.db'.  Both files are closed as
    # soon as parse() has returned.
    with open(sys.argv[1]) as first_handle:
        with open(sys.argv[2]) as second_handle:
            pathways, gene_pathways = parse(
                first_handle, second_handle,
                dbutils.KOBASDB(
                    config.getrc()['kobasdb'] + sys.argv[3] + '.db'))

    print("%s %s" % (len(pathways), len(gene_pathways)))

    # Distinct values of the first and of the second element of each pair.
    g = set(gp[0] for gp in gene_pathways)
    p = set(gp[1] for gp in gene_pathways)

    print("%s %s" % (len(g), len(p)))

    pprint(pathways[:5])
    pprint(gene_pathways[:5])
Beispiel #16
0
 def testCheetah(self):
     """Render the test template and check that no '$' is left in the output."""
     tmpl = os.path.join(config.getrc()['kobas_home'],
                         "template", "test_html.tmpl")
     data = {'thead': range(7), 'result': [range(7) for i in range(10)], 'title': 'Test'}
     rendered = str(Template(file=tmpl, searchList=[data]))
     # assertTrue replaces the deprecated failUnless alias.
     self.assertTrue('$' not in rendered)
Beispiel #17
0
    gids = [gid[0] for gid in speciesdb.genes(name=False)]

    geid_oeids = get_geid_oeids(orthologydir, abbr)

    for gid in gids:
        geids = speciesdb.ensembl_gene_ids_from_gid(gid)
        for geid in geids:
            if geid_oeids.has_key(geid[0]):
                for oeid, ospecies in geid_oeids[geid[0]]:
                    ospeciedb = dbutils.KOBASDB(kobasrc['kobasdb'] + ospecies +
                                                '.db')
                    oids = ospeciedb.gids_from_ensembl_gene_id(oeid)
                    for oid in oids:
                        orthologs.add((gid, oid[0]))

    return list(orthologs)


if __name__ == '__main__':
    import sys
    from pprint import pprint

    from kobas import config

    # Script mode: compute the orthologs for 'hsa' from the orthology files
    # under '<kobas_home>/orthology/' and show how many there are plus the
    # first five.
    kobasrc = config.getrc()
    kobas_home = kobasrc['kobas_home']
    orthologydir = kobas_home + '/orthology/'
    orthologs = parse(kobasrc, orthologydir, 'hsa')

    print(len(orthologs))
    pprint(orthologs[:5])
Beispiel #18
0
            ):  ##if 'not' in column 4, discard the annotation
                tid = info[12].split('|')[0][
                    6:]  ##if '|' in column 13, only the first organism encodes the gene or gene product
                if tid_abbrs.has_key(tid):
                    abbr = tid_abbrs[tid]
                    gpid = info[1].split(':')[-1]  ##column 2 is DB Object ID
                    if organism[1] != 'not_need':
                        uniprotkb_acs = gp2protein.get(gpid, [])
                    else:
                        uniprotkb_acs = [gpid]
                    goid = info[4]  ##column 5 is GO ID
                    for uniprotkb_ac in uniprotkb_acs:
                        multi_uniprotkb_ac_gos[abbr].add((uniprotkb_ac, goid))

    return multi_uniprotkb_ac_gos


if __name__ == '__main__':
    import sys
    from pprint import pprint

    from kobas import config

    # Script mode: parse the human association file and print the number of
    # top-level entries, the number of 'hsa' pairs and the first five pairs.
    kobas_home = config.getrc()['kobas_home']
    organism = ('gene_association.goa_human', 'not_need', '9606', 'hsa')
    multi_uniprotkb_ac_gos = parse(kobas_home, organism)

    print(len(multi_uniprotkb_ac_gos))
    hsa_pairs = multi_uniprotkb_ac_gos['hsa']
    print(len(hsa_pairs))
    pprint(tuple(hsa_pairs)[:5])