def to_html(self, title=None):
    """Render this object's result table as HTML with Cheetah.

    ``self.add_link_to_pathway()`` is called first, then the template
    ``<kobas_home>/template/test_html.tmpl`` (``kobas_home`` comes from
    ``config.getrc()``) is filled with ``self.title`` under the name
    ``thead`` and ``self.result`` under the name ``result``.

    title -- optional value exposed to the template as ``title``.  It is
             only added to the search list when it is not None, so calls
             without it build exactly the same search list as before.

    Returns the rendered template as a string.
    """
    from kobas import config
    from Cheetah.Template import Template

    tmpl = os.path.join(config.getrc()["kobas_home"], "template", "test_html.tmpl")
    self.add_link_to_pathway()

    namespace = {"thead": self.title, "result": self.result}
    if title is not None:
        # The argument used to be accepted and then silently dropped.
        # NOTE(review): confirm the template actually reads $title.
        namespace["title"] = title

    t = Template(file=tmpl, searchList=[namespace])
    return str(t)
def __init__(self, dbfile="", *args, **kwargs):
    """Open the database connection and create a cursor on it.

    dbfile -- path handed to ``sqlite.connect``.  When it is empty the
              path is taken from the ``keggdb`` entry of
              ``config.getrc()`` and that configuration is kept on
              ``self._rc``.
    Extra positional and keyword arguments are forwarded unchanged to
    ``sqlite.connect``.
    """
    target = dbfile
    if not target:
        # No explicit file: fall back to the configured database.
        self._rc = config.getrc()
        target = self._rc['keggdb']
    self.db = sqlite.connect(target, *args, **kwargs)
    self.cursor = self.db.cursor()
def __init__(self, dbfile="", *args, **kwargs):
    """Connect to the database and keep a cursor for later queries.

    dbfile -- database file passed to ``sqlite.connect``.  If it is
              empty, ``config.getrc()`` is stored on ``self._rc`` and its
              ``keggdb`` entry is used as the file instead.
    Any further positional/keyword arguments go straight through to
    ``sqlite.connect``.
    """
    path_to_open = dbfile
    if not path_to_open:
        # Only read the configuration when no file was supplied.
        self._rc = config.getrc()
        path_to_open = self._rc["keggdb"]
    self.db = sqlite.connect(path_to_open, *args, **kwargs)
    self.cursor = self.db.cursor()
def to_html(self, title=None):
    """Render this object's result table as HTML with Cheetah.

    ``self.add_link_to_pathway()`` is called first, then the template
    ``<kobas_home>/template/test_html.tmpl`` (``kobas_home`` comes from
    ``config.getrc()``) is filled with ``self.title`` under the name
    ``thead`` and ``self.result`` under the name ``result``.

    title -- optional value exposed to the template as ``title``.  It is
             only added to the search list when it is not None, so calls
             without it build exactly the same search list as before.

    Returns the rendered template as a string.
    """
    from kobas import config
    from Cheetah.Template import Template

    tmpl = os.path.join(config.getrc()['kobas_home'], "template", "test_html.tmpl")
    self.add_link_to_pathway()

    namespace = {
        'thead': self.title,
        'result': self.result,
    }
    if title is not None:
        # The argument used to be accepted and then silently dropped.
        # NOTE(review): confirm the template actually reads $title.
        namespace['title'] = title

    t = Template(file=tmpl, searchList=[namespace])
    return str(t)
def requires(self):
    """Publish this task's settings as module globals and list its prerequisites.

    Copies ``self.ref_fa``, ``self.ref_gtf`` and ``self.species_latin``
    into module-level globals, derives the genome/annotation/log
    directories and the KEGG/Ensembl species ids from them, reads the
    ``blastdb`` and ``kobasdb`` entries of ``config.getrc()``, creates the
    log directory through ``circ_mkdir_unix`` and prints the ``blastdb``
    value.

    Returns a list with one instance each of ``fa_index``,
    ``star_index``, ``go_annotation`` and ``ko_annotation``.
    """
    global ref_fa, genome_dir, ref_gtf, annotation_dir, species_latin, species_ensembl, species_kegg, log_dir, ko_pep_dir, ko_db_dir

    # Inputs taken from the task instance.
    ref_fa = self.ref_fa
    ref_gtf = self.ref_gtf
    species_latin = self.species_latin
    species_kegg, species_ensembl = get_kegg_biomart_id(species_latin)

    # Directories derived from the input file locations.
    genome_dir = path.dirname(ref_fa)
    annotation_dir = path.dirname(ref_gtf)
    log_dir = path.join(annotation_dir, 'logs')

    # Locations read from the kobas configuration.
    rc = config.getrc()
    ko_pep_dir = rc['blastdb']
    ko_db_dir = rc['kobasdb']

    circ_mkdir_unix(log_dir)
    print(ko_pep_dir)

    prerequisites = [fa_index(), star_index(), go_annotation(), ko_annotation()]
    return prerequisites
def arg2dist(arg):
    """Get the distribution object selected by ``arg`` and its size.

    arg -- either a three-character genome abbreviation or the path of a
           user-defined annotation file.  A three-character value is
           lower-cased, turned into a file name by
           ``keggdb.get_species_name`` and read from
           ``<kobas_home>/gene_dist/`` with ``discover.dist_from_distfile``;
           anything else is read with ``discover.dist_from_annot_file``.

    Returns ``(dist_dict, dist_size)`` where ``dist_size`` is
    ``dist_dict.size()``.

    If ``arg`` is treated as a file and does not exist, a message is
    printed and the process exits with status 1 (it used to exit with 0,
    which reported success to the calling shell).
    """
    kobasrc = config.getrc()
    if len(arg) == 3:
        keggdb = dbutils.keggdb()
        distfile = os.path.join(kobasrc["kobas_home"], "gene_dist",
                                keggdb.get_species_name(arg.lower()))
        handle = open(distfile)
        reader = discover.dist_from_distfile
    else:
        if not os.access(arg, os.F_OK):
            print("File: %s not exist" % arg)
            sys.exit(1)
        handle = open(arg, 'r')
        reader = discover.dist_from_annot_file

    # Close the input deterministically instead of leaking the handle.
    # NOTE(review): assumes the reader consumes the file before returning.
    try:
        dist_dict = reader(handle)
    finally:
        handle.close()

    dist_size = dist_dict.size()
    return (dist_dict, dist_size)
def distr_from_default(default, db, speciesdb, abbr, small_num, idmapping):
    """Count, per term, how many genes (or mapped ids) are annotated with it.

    default    -- species abbreviation; in the 'ko' case it is passed to
                  ``speciesdb.gids_from_koid`` and names the
                  ``<kobasdb><default>.db`` file used for id mapping.
    db         -- database code; stored in every key and, outside the
                  'ko' case, matched against ``dbutils.P``, ``dbutils.D``
                  and ``dbutils.G`` to choose the term query.
    speciesdb  -- database object the terms and genes are queried from.
    abbr       -- 'ko' selects the KO-based walk (KO -> pathways/genes);
                  any other value walks ``speciesdb.genes()``.
    small_num  -- passed to ``remove_small_terms2`` together with the
                  finished counts.
    idmapping  -- when truthy, every gene contributes the number of ids
                  returned by ``dblink_ids_from_gid(gid, idmapping)``
                  instead of 1.

    Returns a dict keyed by ``(term name, db, term id)`` after
    ``remove_small_terms2`` has been applied to it.
    """
    distr = {}

    def bump(term, id_field, amount):
        # Add ``amount`` to the counter of one term.
        key = (term['name'], db, term[id_field])
        distr[key] = distr.get(key, 0) + amount

    if abbr == 'ko':
        # The mapping database depends only on ``default``, so it is opened
        # at most once (on first use) instead of once per gene.
        bspeciesdb = None
        koids = speciesdb.kos()
        for koid in koids:
            terms = speciesdb.pathways_from_koid(koid['koid'])
            gids = speciesdb.gids_from_koid(koid['koid'], default)
            for gid in gids:
                if idmapping:
                    if bspeciesdb is None:
                        bspeciesdb = dbutils.KOBASDB(config.getrc()['kobasdb'] + default + '.db')
                    dblink_ids = [dblink_id[0] for dblink_id in bspeciesdb.dblink_ids_from_gid(gid['gid'], idmapping)]
                    weight = len(dblink_ids)
                else:
                    weight = 1
                for term in terms:
                    bump(term, 'pid', weight)
    else:
        gids = speciesdb.genes()
        for gid in gids:
            # Pick the term query and the field that holds the term id.
            if db in dbutils.P.keys():
                terms = speciesdb.pathways_from_gid(gid['gid'], db)
                id_field = 'id'
            elif db in dbutils.D.keys():
                terms = speciesdb.diseases_from_gid(gid['gid'], db)
                id_field = 'id'
            elif db in dbutils.G.keys():
                terms = speciesdb.gos_from_gid(gid['gid'])
                id_field = 'goid'
            else:
                # Unknown database code: nothing is counted for this gene.
                terms = ()
                id_field = None

            if idmapping:
                dblink_ids = [dblink_id[0] for dblink_id in speciesdb.dblink_ids_from_gid(gid['gid'], idmapping)]
                weight = len(dblink_ids)
            else:
                weight = 1

            for term in terms:
                bump(term, id_field, weight)

    remove_small_terms2(distr, small_num)
    return distr
def arg2dist(arg):
    """Get the distribution object selected by ``arg`` and its size.

    arg -- either a three-character genome abbreviation or the path of a
           user-defined annotation file.  A three-character value is
           lower-cased, turned into a file name by
           ``keggdb.get_species_name`` and read from
           ``<kobas_home>/gene_dist/`` with ``discover.dist_from_distfile``;
           anything else is read with ``discover.dist_from_annot_file``.

    Returns ``(dist_dict, dist_size)`` where ``dist_size`` is
    ``dist_dict.size()``.

    If ``arg`` is treated as a file and does not exist, a message is
    printed and the process exits with status 1 (it used to exit with 0,
    which reported success to the calling shell).
    """
    kobasrc = config.getrc()
    if len(arg) == 3:
        keggdb = dbutils.keggdb()
        distfile = os.path.join(
            kobasrc["kobas_home"], "gene_dist",
            keggdb.get_species_name(arg.lower()))
        handle = open(distfile)
        reader = discover.dist_from_distfile
    else:
        if not os.access(arg, os.F_OK):
            print("File: %s not exist" % arg)
            sys.exit(1)
        handle = open(arg, 'r')
        reader = discover.dist_from_annot_file

    # Close the input deterministically instead of leaking the handle.
    # NOTE(review): assumes the reader consumes the file before returning.
    try:
        dist_dict = reader(handle)
    finally:
        handle.close()

    dist_size = dist_dict.size()
    return (dist_dict, dist_size)
import kobas.config as kobas_config
import os

# Download URL templates; the single ``{}`` placeholder is filled by the
# caller (presumably with a species abbreviation -- confirm at call sites).
PEP_URL_TEMP = 'ftp://ftp.cbi.pku.edu.cn/pub/KOBAS_3.0_DOWNLOAD/seq_pep/{}.pep.fasta.gz'
DB_URL_TEMP = 'ftp://ftp.cbi.pku.edu.cn/pub/KOBAS_3.0_DOWNLOAD/sqlite3/{}.db.gz'

# Default BLAST command-line options.
BLAST_DEFAULT = '-evalue 1e-5 -outfmt 6 -max_target_seqs 1'

# The kobas configuration is read once, when this module is imported;
# the constants below are plain lookups into it.
KOBASRC = kobas_config.getrc()
BLAST_DIR = KOBASRC['blastout']
PEP_DIR = KOBASRC['blastdb']
DB_DIR = KOBASRC['kobasdb']
KOBAS_RUN = KOBASRC['kobas_run']
KOBAS_PY = KOBASRC['kobas_py']

# Fixed file locations inside the BLAST output directory.
WHEAT_PEP_DB = os.path.join(BLAST_DIR, 'wheat.pep.fa')
TERM_CAT = os.path.join(BLAST_DIR, 'term.cat.txt')
def _scan_gene_sets(records, gene_list, speciesdb, idtype, genes_of,
                    id_col, name_col, skip_unnamed=False):
    """Build the gene-set x input-gene hit matrix for a list of gene sets.

    records      -- sequence of gene-set rows.  ``record[0]`` is handed to
                    ``genes_of``; ``record[id_col]`` and
                    ``record[name_col]`` become the set's name and
                    description.
    gene_list    -- input gene identifiers, one matrix column each.
    speciesdb    -- object providing ``dblink_ids_from_gid(gid, idtype)``.
    idtype       -- forwarded to ``dblink_ids_from_gid``.
    genes_of     -- callable returning the gene rows of one gene set.
    skip_unnamed -- when true, records whose ``record[id_col]`` is empty
                    are skipped and get no matrix row.

    Returns ``(names, descriptions, members, matrix)``.  The three lists
    and the rows of ``matrix`` are aligned; ``matrix[i, j]`` is 1 when
    ``gene_list[j]`` belongs to gene set ``i`` and 0 otherwise.
    """
    # First position of every input gene (same answer as list.index, but
    # without rescanning the list for every database id).
    gene_index = {}
    for pos, gene in enumerate(gene_list):
        gene_index.setdefault(gene, pos)

    names, descriptions, members = [], [], []
    matrix = np.zeros((len(records), len(gene_list)))
    for record in records:
        if skip_unnamed and not record[id_col]:
            continue
        row = len(names)
        hgene = []
        names.append(record[id_col])
        descriptions.append(record[name_col])
        for gid in genes_of(record[0]):
            for each in speciesdb.dblink_ids_from_gid(gid[0], idtype):
                dblink_id = each[0]
                if dblink_id in gene_index:
                    hgene.append(dblink_id)
                    matrix[row, gene_index[dblink_id]] = 1
        members.append(hgene)
    # Drop rows reserved for skipped records so rows and names stay aligned.
    return names, descriptions, members, matrix[:len(names)]


def read_gmt_db(gene_list, species, dbtype, idtype, min_size, max_size):
    """Load gene sets from the species database and match them to gene_list.

    gene_list -- input gene identifiers.
    species   -- species abbreviation; selects ``<kobasdb><species>.db``.
    dbtype    -- ``'<category>'`` or ``'<category>:<db>/<db>...'`` where
                 the category is 'P', 'D' or 'G'.  'P' and 'D' require
                 the database list after the colon; 'D' is only accepted
                 for species 'hsa'.
    idtype    -- id type passed to ``dblink_ids_from_gid`` when mapping
                 database genes onto ``gene_list``.
    min_size, max_size -- gene sets whose number of matched input genes
                 is below ``min_size`` or above ``max_size`` are removed.

    Raises ``exception.GmtdatabaseErr`` for an unknown category, for
    unsupported databases, when no gene set information is found and when
    no input gene matches any gene set; raises ``exception.GmtvalueErr``
    when the size filter removes every gene set.  A ``ValueError`` raised
    inside the body ends the process through ``sys.exit``.

    The filtered results are left in the local names
    ``hit_matrix_filtered``, ``hit_genes_filterd``, ``hit_sum_filtered``,
    ``gset_name_filtered`` and ``gset_des_filtered``.
    NOTE(review): no ``return`` is visible in this block; the local names
    are kept unchanged in case the function continues below.
    """
    # GAD and N are not supported because they lack a did.
    D = {'o': 'OMIM', 'k': 'KEGG DISEASE', 'f': 'FunDO'}

    no_info_msg = "Error Message: \nFail to get information from gene set database."
    no_match_msg = "Error Message:\nNone of genes in input file(gct file) was matched with genes in gene sets. \nPlease check the gct file(-e), the species(-s), the idtype(-i), the database type(-d)."

    try:
        kobasrc = config.getrc()
        speciesdb = dbutils.KOBASDB(kobasrc['kobasdb'] + species + '.db')
        dbcatg = dbtype.split(':')[0]
        if dbcatg not in ['P', 'D', 'G']:
            raise exception.GmtdatabaseErr(
                "Error Message: This database category you input is not allowed. Please choose 'P','D','G' for database category.")

        records = []
        gset_num = 0

        if dbcatg == 'P':
            input_dbs = set(dbtype.split(':')[1].split('/'))
            avail_dbs = {}
            for database in speciesdb.pathwaydbs_from_abbr(species):
                avail_dbs[database[0]] = database[1]
            dbs = input_dbs.intersection(set(avail_dbs.keys()))
            if dbs:
                print("Databases: %s" % ', '.join([avail_dbs[db] for db in dbs]))
            else:
                raise exception.GmtdatabaseErr(
                    "No supported databases are selected. Supported databases are %s, but your input databases are: %s." %
                    ('/'.join(avail_dbs.keys()), dbtype))
            for db in dbs:
                tmp = speciesdb.allpathways(db).fetchall()
                num = speciesdb.pathwaynums(db)
                records += tmp
                gset_num += num
            if gset_num == 0:
                raise exception.GmtdatabaseErr(no_info_msg)

            # record: [0] key for genes_from_pid, [1] id, [2] name
            gset_name, gset_des, hit_genes, hit_matrix = _scan_gene_sets(
                records, gene_list, speciesdb, idtype,
                speciesdb.genes_from_pid, 1, 2)

        elif dbcatg == 'D':
            input_dbs = set(dbtype.split(':')[1].split('/'))
            if species != 'hsa':
                raise exception.GmtdatabaseErr(
                    "Error Message: Disease is only supported for Homo Sapiens(-s hsa), not supported for this species. Please choose another database category, e.g. 'P','G'")
            dbs = input_dbs.intersection(set(D.keys()))
            if not dbs:
                # List the codes that are really supported; this message
                # used to be built from a dict that was always empty.
                raise exception.GmtdatabaseErr(
                    'No supported databases are selected. Supported databases are %s, but your input databases are: %s.' %
                    ('/'.join(sorted(D.keys())), dbtype))
            for db in dbs:
                tmp = speciesdb.alldiseases(db).fetchall()
                num = speciesdb.diseasenums(db)
                records += tmp
                gset_num += num
            if gset_num == 0:
                raise exception.GmtdatabaseErr(no_info_msg)

            # record: [0] key for genes_from_did, [1] id, [2] name;
            # records without an id are skipped.
            gset_name, gset_des, hit_genes, hit_matrix = _scan_gene_sets(
                records, gene_list, speciesdb, idtype,
                speciesdb.genes_from_did, 1, 2, skip_unnamed=True)

        else:
            records = speciesdb.allgoterms().fetchall()
            gset_num = speciesdb.gotermnums()

            # record: [0] GO id (also the key for genes_from_goid), [1] name.
            # Every mapped id is now tested on its own, like the other two
            # categories; the old code collected ids into a list and tested
            # that list against gene_list, which could never match.
            gset_name, gset_des, hit_genes, hit_matrix = _scan_gene_sets(
                records, gene_list, speciesdb, idtype,
                speciesdb.genes_from_goid, 0, 1)

        if not hit_matrix.any():
            raise exception.GmtdatabaseErr(no_match_msg)

        gset_name = np.array(tuple(gset_name))
        gset_des = np.array(gset_des)
        hit_genes = np.array(hit_genes)

        # Number of matched input genes per gene set, and the rows whose
        # count falls outside [min_size, max_size].
        hitsum = hit_matrix.sum(1)
        delindex = np.where((hitsum < min_size) | (hitsum > max_size))[0]

        hit_matrix_filtered = np.delete(hit_matrix, delindex, axis=0)
        hit_genes_filterd = np.delete(hit_genes, delindex, axis=0)
        hit_sum_filtered = np.delete(hitsum, delindex, axis=0)
        gset_name_filtered = np.delete(np.array(gset_name), delindex, axis=0)
        gset_des_filtered = np.delete(gset_des, delindex, axis=0)

        if len(hit_matrix_filtered) == 0 and len(gset_name_filtered) == 0 \
                and len(gset_des_filtered) == 0 and len(hit_genes_filterd) == 0:
            raise exception.GmtvalueErr(
                "Error Message:\nAll gene sets " + repr(len(hit_matrix)) +
                " have been filtered. \nPlease check the threshold and ceil of gene set size (values of min_size and max_size). ")
    except ValueError as e:
        sys.exit(e)
for abbr in pathways.keys(): speciesdb = dbutils.KOBASDB(kobasrc['kobasdb'] + abbr + '.db') count = 0 for pname in pathways[abbr].keys(): count += 1 pathways[abbr][pname][0] = count * 10 + 4 if __name__ == '__main__': print pathways[abbr][pname][0], 'R', pathways[abbr][pname][ 1], pname else: speciesdb.con.execute('INSERT INTO Pathways VALUES (?, ?, ?, ?)', \ (pathways[abbr][pname][0], 'R', pathways[abbr][pname][1], pname)) for gene_pathway in gene_pathways[abbr]: if __name__ == '__main__': print gene_pathway[0], pathways[abbr][gene_pathway[1]][0] else: speciesdb.con.execute('INSERT INTO GenePathways VALUES (?, ?)', \ (gene_pathway[0], pathways[abbr][gene_pathway[1]][0])) if __name__ == '__main__': import sys from kobas import config parse(open(sys.argv[1]), open(sys.argv[2]), config.getrc())
for pathway in tuple(pathways): if pathway[0] not in pids: pathways.remove(pathway) pathways = tuple(pathways) gene_pathways = tuple(gene_pathways) return pathways, gene_pathways if __name__ == '__main__': import sys from pprint import pprint from kobas import config, dbutils pathways, gene_pathways = parse(open(sys.argv[1]), \ dbutils.KOBASDB(config.getrc()['kobasdb'] + 'hsa.db')) print len(pathways), len(gene_pathways) g, p = set(), set() for gp in gene_pathways: g.add(gp[0]) p.add(gp[1]) print len(g), len(p) pprint(pathways[:5]) pprint(gene_pathways[:5])
help="specify which program to use by blastall, default blastp") p.add_option( "-r", "--rank", dest="rank", action="store", type="string", help="rank cutoff for valid hit from BLAST, default 5") (opt,args) = p.parse_args() return (p,opt,args) if __name__ == "__main__": opt_parser,opt,args = config_option() if len(args) != 1: opt_parser.print_help() sys.exit(1) # Blast environment configuration kobasrc = config.getrc() # Integrate command line switches into environments if opt.rank: kobasrc["rank"] = opt.rank if opt.evalue: kobasrc["evalue"] = opt.evalue keggdb = dbutils.keggdb(kobasrc['kobasdb']) if opt.intype == "fasta": # verify fasta file try: try: f = open(args[0]) fasta.verify(f)
#!/usr/bin/env python
# -*- coding: ISO-8859-1 -*-
# Copyright by Mao Xizeng ([email protected])
# Created: 2005-03-24 13:14:23
# $Id: kpath.py 465 2008-03-04 18:15:07Z lymxz $
"""KEGG pathway mining based on KO
"""
# The docstring above must be the first statement of the module; placed
# after ``__version__`` it was an ordinary expression and ``__doc__``
# stayed None.

__version__ = '$LastChangedRevision: 465 $'.split()[-2]

from glob import glob
from os.path import join

from kobas.config import getrc
from kobas.kgml import kgml
from pygraphlib import pygraph, algo, pydot

# KGML collection rooted at <dat_dir>/data/kgml, created at import time.
kg = kgml(join(getrc()['dat_dir'], 'data', 'kgml'))


def dgraph_from_kgml():
    """Build a graph of pathway-to-pathway links from the KGML data.

    For every pathway returned by ``kg.get_pathways('ot')`` one edge
    ``(pathway key, linked.name)`` is created per entry of
    ``get_linked_pathways()``.

    Returns the result of ``pygraph.from_list`` on that edge list.
    """
    pathways = kg.get_pathways('ot')
    edges = [(key, linked.name)
             for key, pathway in pathways.items()
             for linked in pathway.get_linked_pathways()]
    return pygraph.from_list(edges)


if __name__ == "__main__":
    pass
gids = bcids[gene_id] for gid in gids: gene_pathways.add((gid, pid)) pathways = list(pathways) gene_pathways = list(gene_pathways) return pathways, gene_pathways if __name__ == '__main__': import sys from pprint import pprint from kobas import config, dbutils pathways, gene_pathways = parse(open(sys.argv[1]), open(sys.argv[2]), \ dbutils.KOBASDB(config.getrc()['kobasdb'] + sys.argv[3] + '.db')) print len(pathways), len(gene_pathways) g, p = set(), set() for gp in gene_pathways: g.add(gp[0]) p.add(gp[1]) print len(g), len(p) pprint(pathways[:5]) pprint(gene_pathways[:5])
def testCheetah(self):
    """Render test_html.tmpl with sample data and check no '$' remains.

    The template is looked up under ``<kobas_home>/template`` and filled
    with a 7-column header, ten 7-column rows and the title 'Test'.
    """
    template_path = os.path.join(config.getrc()['kobas_home'], "template", "test_html.tmpl")
    namespace = {
        'thead': range(7),
        'result': [range(7) for _ in range(10)],
        'title': 'Test',
    }
    rendered = str(Template(file=template_path, searchList=[namespace]))
    # A leftover '$' would mean a placeholder was not substituted.
    self.failUnless('$' not in rendered)
gids = [gid[0] for gid in speciesdb.genes(name=False)] geid_oeids = get_geid_oeids(orthologydir, abbr) for gid in gids: geids = speciesdb.ensembl_gene_ids_from_gid(gid) for geid in geids: if geid_oeids.has_key(geid[0]): for oeid, ospecies in geid_oeids[geid[0]]: ospeciedb = dbutils.KOBASDB(kobasrc['kobasdb'] + ospecies + '.db') oids = ospeciedb.gids_from_ensembl_gene_id(oeid) for oid in oids: orthologs.add((gid, oid[0])) return list(orthologs) if __name__ == '__main__': import sys from pprint import pprint from kobas import config kobasrc = config.getrc() orthologydir = kobasrc['kobas_home'] + '/orthology/' orthologs = parse(kobasrc, orthologydir, 'hsa') print len(orthologs) pprint(orthologs[:5])
): ##if 'not' in column 4, discard the annotation tid = info[12].split('|')[0][ 6:] ##if '|' in column 13, only the first organism encodes the gene or gene product if tid_abbrs.has_key(tid): abbr = tid_abbrs[tid] gpid = info[1].split(':')[-1] ##column 2 is DB Object ID if organism[1] != 'not_need': uniprotkb_acs = gp2protein.get(gpid, []) else: uniprotkb_acs = [gpid] goid = info[4] ##column 5 is GO ID for uniprotkb_ac in uniprotkb_acs: multi_uniprotkb_ac_gos[abbr].add((uniprotkb_ac, goid)) return multi_uniprotkb_ac_gos if __name__ == '__main__': import sys from pprint import pprint from kobas import config multi_uniprotkb_ac_gos = parse( config.getrc()['kobas_home'], ('gene_association.goa_human', 'not_need', '9606', 'hsa')) print len(multi_uniprotkb_ac_gos) print len(multi_uniprotkb_ac_gos['hsa']) pprint(tuple(multi_uniprotkb_ac_gos['hsa'])[:5])