def insert_orthology(kobasrc, kobasdir):
    """Populate the ``Orthologs`` table of every species database.

    The organisms are listed by ``organism.db`` (queried with
    ``name=False``); element 0 of each returned row is used as the species
    abbreviation.  For every species the ortholog pairs produced by
    ``tsv.parse`` are bulk-inserted into that species' own database.

    kobasrc  -- configuration mapping; ``kobasrc['kobasdb']`` is the path
                prefix of the database files
    kobasdir -- base data directory; orthology data is read from
                ``kobasdir + '/orthology/'``
    """
    insert_sql = 'INSERT INTO Orthologs VALUES (?, ?)'
    organismdb = dbutils.KOBASDB(kobasrc['kobasdb'] + 'organism.db')
    for organism in organismdb.organisms(name=False):
        species = organism[0]
        pairs = tsv.parse(kobasrc, kobasdir + '/orthology/', species)
        target = dbutils.KOBASDB(kobasrc['kobasdb'] + species + '.db')
        target.con.executemany(insert_sql, pairs)
def parse(stid_handle, map_handle, kobasrc):
    """Load Reactome pathway annotations into the per-species databases.

    stid_handle -- iterable of tab-separated lines whose first three
                   fields are (UniProtKB accession, pathway ID, pathway
                   name)
    map_handle  -- iterable of tab-separated lines with exactly five
                   fields: accession, (unused), '; '-separated pathway
                   names, (unused), species name
    kobasrc     -- configuration mapping; ``kobasrc['kobasdb']`` is the
                   path prefix of the species database files

    Lines whose species name is not a key of
    ``reactome_organisms.reactome_organisms`` are ignored.  Within each
    species every distinct pathway name gets the numeric ID
    ``count * 10 + 4`` (count starting at 1) and is written to ``Pathways``
    with database code 'R'; gene/pathway pairs go to ``GenePathways``.
    When this module runs as a script the rows are printed instead of
    being inserted.  Returns None.
    """
    # (accession, pathway name) -> pathway ID; the first occurrence wins.
    stids = {}
    for line in stid_handle:
        uniprotkb_ac, paid, pname = line.split('\t')[:3]
        stids.setdefault((uniprotkb_ac, pname), paid)

    # One database handle per species instead of one per input line.
    species_dbs = {}
    pathways, gene_pathways = {}, {}
    for line in map_handle:
        # Strip only the line terminator so that a final line without a
        # trailing newline does not lose the last character of its species.
        fields = line.rstrip('\r\n').split('\t')
        uniprotkb_ac, _, pnames, _, sname = fields
        if sname not in reactome_organisms.reactome_organisms:
            continue
        abbr = reactome_organisms.reactome_organisms[sname]
        if abbr not in species_dbs:
            species_dbs[abbr] = dbutils.KOBASDB(
                kobasrc['kobasdb'] + abbr + '.db')
        speciesdb = species_dbs[abbr]
        gids = speciesdb.gids_from_uniprotkb_ac(uniprotkb_ac)
        # Drop the leading "[...]: " tag, split the list, then remove the
        # " IEA" suffix from each individual name.
        cleaned = [re.sub(r' IEA$', '', name)
                   for name in re.sub(r'\[.*\]: ', '', pnames).split('; ')]
        for gid in gids:
            for pname in cleaned:
                paid = stids.get((uniprotkb_ac, pname), '')
                species_pathways = pathways.setdefault(abbr, {})
                if pname not in species_pathways:
                    # Slot 0 is a placeholder for the numeric ID assigned
                    # below; slot 1 keeps the pathway ID ('' if unknown).
                    species_pathways[pname] = [0, paid]
                gene_pathways.setdefault(abbr, set()).add((gid[0], pname))

    for abbr in pathways:
        speciesdb = species_dbs[abbr]
        for count, pname in enumerate(pathways[abbr], 1):
            entry = pathways[abbr][pname]
            entry[0] = count * 10 + 4
            if __name__ == '__main__':
                print('%s R %s %s' % (entry[0], entry[1], pname))
            else:
                speciesdb.con.execute(
                    'INSERT INTO Pathways VALUES (?, ?, ?, ?)',
                    (entry[0], 'R', entry[1], pname))
        for gid, pname in gene_pathways[abbr]:
            pid = pathways[abbr][pname][0]
            if __name__ == '__main__':
                print('%s %s' % (gid, pid))
            else:
                speciesdb.con.execute(
                    'INSERT INTO GenePathways VALUES (?, ?)', (gid, pid))
def insert_go(kobasrc, kobasdir):
    """Fill the ``Gos`` and ``GeneGos`` tables of the species databases.

    The ontology is parsed from ``kobasdir + '/go/gene_ontology.1_2.obo'``
    into ``go_terms`` (GO ID -> value stored next to the ID in ``Gos``) and
    ``go_inferreds`` (GO ID -> iterable of inferred GO IDs).  For each entry
    of ``go_organisms.go_organisms`` the associations returned by
    ``association.parse`` are mapped to gene IDs through the species
    database.  A GO ID is recorded only when it is not in ``go_categories``
    and is present in ``go_terms``; the same filter applies to its inferred
    IDs.

    kobasrc  -- configuration mapping; ``kobasrc['kobasdb']`` is the path
                prefix of the species database files
    kobasdir -- base data directory
    """
    # Close the ontology file as soon as parsing is finished.
    with open(kobasdir + '/go/gene_ontology.1_2.obo') as obo_handle:
        go_terms, go_inferreds = obo.parse(obo_handle)

    def is_recordable(goid):
        # Direct membership test; avoids building a key list per lookup.
        return goid not in go_categories and goid in go_terms

    for organism in go_organisms.go_organisms:
        multi_uniprotkb_ac_gos = association.parse(kobasdir, organism)
        # Element 3 holds one or more '|'-separated species abbreviations.
        for abbr in organism[3].split('|'):
            speciesdb = dbutils.KOBASDB(kobasrc['kobasdb'] + abbr + '.db')
            gos, gene_gos = set(), set()
            for uniprotkb_ac_go in multi_uniprotkb_ac_gos[abbr]:
                uniprotkb_ac, direct_goid = uniprotkb_ac_go[0], uniprotkb_ac_go[1]
                if not is_recordable(direct_goid):
                    continue
                for gid in speciesdb.gids_from_uniprotkb_ac(uniprotkb_ac):
                    gene_gos.add((gid[0], direct_goid))
                    gos.add((direct_goid, go_terms[direct_goid]))
                    for goid in go_inferreds[direct_goid]:
                        if is_recordable(goid):
                            gene_gos.add((gid[0], goid))
                            gos.add((goid, go_terms[goid]))
            speciesdb.con.executemany(
                'INSERT INTO Gos VALUES (?, ?)', list(gos))
            speciesdb.con.executemany(
                'INSERT INTO GeneGos VALUES (?, ?)', list(gene_gos))
def parse(handle, kobasrc):
    """Load PANTHER pathway annotations into the per-species databases.

    handle  -- iterable of tab-separated lines with at least five fields:
               pathway ID, pathway name, (unused), (unused), gene
               descriptor.  The descriptor is '|'-separated; its first part
               is the species name and it may carry a ``UniProtKB=<acc>``
               or ``ENTREZ=<id>`` part.
    kobasrc -- configuration mapping; ``kobasrc['kobasdb']`` is the path
               prefix of the species database files

    Lines whose species is not a key of
    ``panther_organisms.panther_organisms``, or whose descriptor has neither
    identifier kind, are ignored.  Within each species every distinct
    pathway name gets the numeric ID ``count * 10 + 6`` (count starting at
    1) and is written to ``Pathways`` with database code 'p'; gene/pathway
    pairs go to ``GenePathways``.  When this module runs as a script the
    rows are printed instead of being inserted.  Returns None.
    """
    # One database handle per species instead of one per input line.
    species_dbs = {}
    pathways, gene_pathways = {}, {}
    for line in handle:
        paid, pname, _, _, gene = line.split('\t')[:5]
        sname = gene.split('|')[0]
        if sname not in panther_organisms.panther_organisms:
            continue
        abbr = panther_organisms.panther_organisms[sname]
        if abbr not in species_dbs:
            species_dbs[abbr] = dbutils.KOBASDB(
                kobasrc['kobasdb'] + abbr + '.db')
        speciesdb = species_dbs[abbr]

        # A UniProtKB accession takes precedence over an Entrez gene ID.
        if 'UniProtKB' in gene:
            uniprotkb_ac = gene[gene.find('UniProtKB'):].split('|')[0].split('=')[1]
            gids = speciesdb.gids_from_uniprotkb_ac(uniprotkb_ac)
        elif 'ENTREZ' in gene:
            entrez_gene_id = gene[gene.find('ENTREZ'):].split('|')[0].split('=')[1]
            gids = speciesdb.gids_from_entrez_gene_id(entrez_gene_id)
        else:
            continue

        for gid in gids:
            species_pathways = pathways.setdefault(abbr, {})
            if pname not in species_pathways:
                # Slot 0 is a placeholder for the numeric ID assigned below.
                species_pathways[pname] = [0, paid]
            gene_pathways.setdefault(abbr, set()).add((gid[0], pname))

    for abbr in pathways:
        speciesdb = species_dbs[abbr]
        for count, pname in enumerate(pathways[abbr], 1):
            entry = pathways[abbr][pname]
            entry[0] = count * 10 + 6
            if __name__ == '__main__':
                print('%s p %s %s' % (entry[0], entry[1], pname))
            else:
                speciesdb.con.execute(
                    'INSERT INTO Pathways VALUES (?, ?, ?, ?)',
                    (entry[0], 'p', entry[1], pname))
        for gid, pname in gene_pathways[abbr]:
            pid = pathways[abbr][pname][0]
            if __name__ == '__main__':
                print('%s %s' % (gid, pid))
            else:
                speciesdb.con.execute(
                    'INSERT INTO GenePathways VALUES (?, ?)', (gid, pid))
def parse(kobasrc, orthologydir, abbr):
    """Return the ortholog gene pairs of species ``abbr``.

    Every gene of the species is translated to its Ensembl gene IDs; each
    Ensembl ID found in the mapping returned by
    ``get_geid_oeids(orthologydir, abbr)`` yields (ortholog Ensembl ID,
    ortholog species) pairs, which are translated back to gene IDs through
    the ortholog species' database.

    kobasrc      -- configuration mapping; ``kobasrc['kobasdb']`` is the
                    path prefix of the species database files
    orthologydir -- directory handed to ``get_geid_oeids``
    abbr         -- abbreviation of the query species

    Returns a list of unique ``(gid, ortholog_gid)`` tuples in arbitrary
    order.
    """
    speciesdb = dbutils.KOBASDB(kobasrc['kobasdb'] + abbr + '.db')
    gids = [row[0] for row in speciesdb.genes(name=False)]
    geid_oeids = get_geid_oeids(orthologydir, abbr)

    # One handle per ortholog species instead of one per ortholog pair.
    ortholog_dbs = {}
    orthologs = set()
    for gid in gids:
        for geid in speciesdb.ensembl_gene_ids_from_gid(gid):
            if geid[0] not in geid_oeids:
                continue
            for oeid, ospecies in geid_oeids[geid[0]]:
                if ospecies not in ortholog_dbs:
                    ortholog_dbs[ospecies] = dbutils.KOBASDB(
                        kobasrc['kobasdb'] + ospecies + '.db')
                oids = ortholog_dbs[ospecies].gids_from_ensembl_gene_id(oeid)
                for oid in oids:
                    orthologs.add((gid, oid[0]))
    return list(orthologs)
def distr_from_default(default, db, speciesdb, abbr, small_num, idmapping):
    """Count, per annotation term, the genes of the default background.

    default    -- abbreviation of the background species; used only when
                  ``abbr == 'ko'``
    db         -- database code; stored in every key and, outside the 'ko'
                  case, selects the query (``dbutils.P`` -> pathways,
                  ``dbutils.D`` -> diseases, ``dbutils.G`` -> GO terms)
    speciesdb  -- open database object that is queried
    abbr       -- 'ko' selects the KO-based branch, anything else the
                  per-gene branch
    small_num  -- threshold handed to ``remove_small_terms2``
    idmapping  -- when truthy, each gene contributes the number of its
                  dblink IDs of this type instead of 1

    Returns a dict mapping ``(term name, db, term id)`` to a count, after
    ``remove_small_terms2(distr, small_num)`` has been applied to it.
    """
    distr = {}

    def tally(term, id_field, weight):
        key = (term['name'], db, term[id_field])
        distr[key] = distr.get(key, 0) + weight

    if abbr == 'ko':
        # Opened lazily, at most once, instead of once per gene.
        bspeciesdb = None
        for koid in speciesdb.kos():
            terms = speciesdb.pathways_from_koid(koid['koid'])
            gids = speciesdb.gids_from_koid(koid['koid'], default)
            # NOTE(review): `terms` is re-iterated for every gene; if
            # pathways_from_koid returns a one-shot iterator only the first
            # gene of each KO is counted -- confirm it is re-iterable.
            for gid in gids:
                weight = 1
                if idmapping:
                    if bspeciesdb is None:
                        bspeciesdb = dbutils.KOBASDB(
                            config.getrc()['kobasdb'] + default + '.db')
                    weight = len([
                        dblink_id[0] for dblink_id in
                        bspeciesdb.dblink_ids_from_gid(gid['gid'], idmapping)])
                for term in terms:
                    tally(term, 'pid', weight)
    else:
        # Pick the query and the ID column once; same precedence as the
        # per-gene checks (pathway, then disease, then GO).
        if db in dbutils.P:
            kind, id_field = 'P', 'id'
        elif db in dbutils.D:
            kind, id_field = 'D', 'id'
        elif db in dbutils.G:
            kind, id_field = 'G', 'goid'
        else:
            kind, id_field = None, None

        for gid in speciesdb.genes():
            if kind == 'P':
                terms = speciesdb.pathways_from_gid(gid['gid'], db)
            elif kind == 'D':
                terms = speciesdb.diseases_from_gid(gid['gid'], db)
            elif kind == 'G':
                terms = speciesdb.gos_from_gid(gid['gid'])
            else:
                # Unknown database code: nothing is counted.
                terms = ()
            weight = 1
            if idmapping:
                weight = len([
                    dblink_id[0] for dblink_id in
                    speciesdb.dblink_ids_from_gid(gid['gid'], idmapping)])
            for term in terms:
                tally(term, id_field, weight)

    remove_small_terms2(distr, small_num)
    return distr
def _collect_gene_sets(records, gene_list, genes_of, dblinks_of,
                       id_col, name_col, skip_unnamed=False):
    """Build gene-set labels and the gene-set x input-gene hit matrix.

    records      -- sequence of rows; element 0 is the key passed to
                    ``genes_of``, ``id_col``/``name_col`` index the label
                    and description
    gene_list    -- list of input gene identifiers (matrix columns)
    genes_of     -- callable(key) -> iterable of rows whose element 0 is a
                    gene ID
    dblinks_of   -- callable(gene ID) -> iterable of rows whose element 0 is
                    an external identifier
    skip_unnamed -- when true, records with a falsy label are ignored

    Returns ``(names, descriptions, hits, matrix)`` where ``matrix`` has
    exactly one row per kept record.
    """
    # First position of each input gene, i.e. what list.index() returns.
    column_of = {}
    for pos, gene in enumerate(gene_list):
        column_of.setdefault(gene, pos)

    names, descriptions, hits = [], [], []
    matrix = np.zeros((len(records), len(gene_list)))
    row = 0
    for record in records:
        if skip_unnamed and not record[id_col]:
            continue
        names.append(record[id_col])
        descriptions.append(record[name_col])
        matched = []
        for gid in genes_of(record[0]):
            for link in dblinks_of(gid[0]):
                dblink_id = link[0]
                col = column_of.get(dblink_id)
                if col is not None:
                    matched.append(dblink_id)
                    matrix[row, col] = 1
        hits.append(matched)
        row += 1
    # Drop rows reserved for skipped records so that the matrix stays
    # aligned with the label lists.
    return names, descriptions, hits, matrix[:row]


def read_gmt_db(gene_list, species, dbtype, idtype, min_size, max_size):
    """Build gene sets from the species database and filter them by size.

    gene_list -- list of input gene identifiers
    species   -- species abbreviation; selects the database file
    dbtype    -- 'P:<db>/<db>...', 'D:<db>/<db>...' or 'G'; the part before
                 ':' is the category (pathway, disease, GO)
    idtype    -- identifier type passed to ``dblink_ids_from_gid``
    min_size, max_size -- gene sets whose number of matched input genes is
                 outside [min_size, max_size] are removed

    Raises ``exception.GmtdatabaseErr`` / ``exception.GmtvalueErr`` for
    unsupported input or when nothing is left; any ``ValueError`` raised
    inside is turned into ``sys.exit``.

    NOTE(review): no ``return`` is visible in this part of the file; the
    local names of the original (including ``hit_genes_filterd``) are kept
    unchanged in case a return statement follows -- confirm.
    """
    gset_name = []  # labels of the gene sets
    gset_des = []  # descriptions of the gene sets
    hit_genes = []  # per gene set: input genes that belong to it
    # GAD and N are not offered here (original note: they lack a did).
    D = {'o': 'OMIM', 'k': 'KEGG DISEASE', 'f': 'FunDO'}
    try:
        kobasrc = config.getrc()
        speciesdb = dbutils.KOBASDB(kobasrc['kobasdb'] + species + '.db')

        def dblinks_of(gid):
            return speciesdb.dblink_ids_from_gid(gid, idtype)

        dbcatg = dbtype.split(':')[0]
        if dbcatg not in ('P', 'D', 'G'):
            raise exception.GmtdatabaseErr(
                "Error Message: This database category you input is not allowed. Please choose 'P','D','G' for database category.")

        records = []
        gset_num = 0
        if dbcatg == 'P':
            input_dbs = set(dbtype.split(':')[1].split('/'))
            avail_dbs = {}
            for database in speciesdb.pathwaydbs_from_abbr(species):
                avail_dbs[database[0]] = database[1]
            dbs = input_dbs.intersection(set(avail_dbs.keys()))
            if not dbs:
                raise exception.GmtdatabaseErr(
                    "No supported databases are selected. Supported databases are %s, but your input databases are: %s." %
                    ('/'.join(avail_dbs.keys()), dbtype))
            print("Databases: %s" % ', '.join([avail_dbs[db] for db in dbs]))
            for db in dbs:
                records += speciesdb.allpathways(db).fetchall()
                gset_num += speciesdb.pathwaynums(db)
            if gset_num == 0:
                raise exception.GmtdatabaseErr(
                    "Error Message: \nFail to get information from gene set database.")
            gset_name, gset_des, hit_genes, hit_matrix = _collect_gene_sets(
                records, gene_list, speciesdb.genes_from_pid, dblinks_of, 1, 2)
        elif dbcatg == 'D':
            input_dbs = set(dbtype.split(':')[1].split('/'))
            if species != 'hsa':
                raise exception.GmtdatabaseErr(
                    "Error Message: Disease is only supported for Homo Sapiens(-s hsa), not supported for this species. Please choose another database category, e.g. 'P','G'")
            dbs = input_dbs.intersection(set(D.keys()))
            if not dbs:
                # Report the codes that are actually supported (keys of D).
                raise exception.GmtdatabaseErr(
                    'No supported databases are selected. Supported databases are %s, but your input databases are: %s.' %
                    ('/'.join(sorted(D.keys())), dbtype))
            for db in dbs:
                records += speciesdb.alldiseases(db).fetchall()
                gset_num += speciesdb.diseasenums(db)
            if gset_num == 0:
                raise exception.GmtdatabaseErr(
                    "Error Message: \nFail to get information from gene set database.")
            # Disease records without an id are left out entirely.
            gset_name, gset_des, hit_genes, hit_matrix = _collect_gene_sets(
                records, gene_list, speciesdb.genes_from_did, dblinks_of, 1, 2,
                skip_unnamed=True)
        else:
            records = speciesdb.allgoterms().fetchall()
            gset_num = speciesdb.gotermnums()
            # Identifiers are matched one by one, as in the other branches.
            gset_name, gset_des, hit_genes, hit_matrix = _collect_gene_sets(
                records, gene_list, speciesdb.genes_from_goid, dblinks_of, 0, 1)

        if not hit_matrix.any():
            raise exception.GmtdatabaseErr(
                "Error Message:\nNone of genes in input file(gct file) was matched with genes in gene sets. \nPlease check the gct file(-e), the species(-s), the idtype(-i), the database type(-d).")

        gset_name = np.array(tuple(gset_name))
        gset_des = np.array(gset_des)
        # NOTE(review): the per-set gene lists usually differ in length;
        # recent NumPy releases refuse to build an array from such input
        # without dtype=object -- confirm the NumPy version in use.
        hit_genes = np.array(hit_genes)
        hitsum = hit_matrix.sum(1)
        delindex = np.where((hitsum < min_size) | (hitsum > max_size))[0]
        hit_matrix_filtered = np.delete(hit_matrix, delindex, axis=0)
        hit_genes_filterd = np.delete(hit_genes, delindex, axis=0)
        hit_sum_filtered = np.delete(hitsum, delindex, axis=0)
        gset_name_filtered = np.delete(np.array(gset_name), delindex, axis=0)
        gset_des_filtered = np.delete(gset_des, delindex, axis=0)
        if (len(hit_matrix_filtered) == 0 and len(gset_name_filtered) == 0
                and len(gset_des_filtered) == 0
                and len(hit_genes_filterd) == 0):
            raise exception.GmtvalueErr(
                "Error Message:\nAll gene sets " + repr(len(hit_matrix)) +
                " have been filtered. \nPlease check the threshold and ceil of gene set size (values of min_size and max_size). ")
    except ValueError as e:
        sys.exit(e)
# NOTE(review): this is a fragment of a larger script; the enclosing scope
# (module level or a main block) is not visible here -- confirm indentation.

# Lookup tables keyed by input-type strings; presumably 'fasta:*' selects a
# BLAST program and 'id:*' a dblink identifier type -- verify against use.
PROGRAMS = {'fasta:pro': 'blastp', 'fasta:nuc': 'blastx'}
DBLINKS = {
    'id:ncbigene': 'entrez_gene_id',
    'id:ncbigi': 'gi',
    'id:uniprot': 'uniprotkb_ac',
    'id:ensembl': 'ensembl_gene_id'
}

# Parsed command line: the parser itself, the option values, positional args.
opt_parser, opt, args = config_option()

# KOBAS environment configuration
kobasrc = config.getrc()

# Open the organism database; the species database is opened only when a
# species was given on the command line.
organismdb = dbutils.KOBASDB(kobasrc['kobasdb'] + 'organism.db')
if opt.species:
    speciesdb = dbutils.KOBASDB(kobasrc['kobasdb'] + opt.species + '.db')

# Listing mode: print the databases of the chosen species, or all available
# species when none was chosen, then stop.
if opt.list:
    if opt.species:
        print 'Available databases for %s:' % opt.species
        databases = speciesdb.databases_from_abbr(opt.species)
        for database in databases:
            print '\t'.join(database)
    else:
        print 'Available species: \nko\tKEGG Orthology'
        species = organismdb.organisms(name=True)
        for specie in species:
            print '\t'.join(specie)
    sys.exit()
# NOTE(review): this fragment starts mid-script; the first assignment may
# belong to a conditional whose header lies outside this view, and `abbr`
# is defined elsewhere -- confirm before relying on the layout below.
kobasrc['blastdb'] = opt.kobas_home + '/seq_pep/'

# Command-line options override the configured paths.  --blast_home sets
# both program directories; the more specific options below are applied
# afterwards and therefore take precedence.
if opt.blast_home:
    kobasrc['blast_home'] = opt.blast_home
    kobasrc['blastp'] = opt.blast_home + '/blastp/'
    kobasrc['blastx'] = opt.blast_home + '/blastx/'
if opt.blastdb:
    kobasrc['blastdb'] = opt.blastdb + '/'
if opt.kobasdb:
    kobasrc['kobasdb'] = opt.kobasdb + '/'
if opt.blastp:
    kobasrc['blastp'] = opt.blastp + '/'
if opt.blastx:
    kobasrc['blastx'] = opt.blastx + '/'

# Open the species database (after the kobasdb override above).
speciesdb = dbutils.KOBASDB(kobasrc['kobasdb'] + abbr + '.db')

# Process opt.db: keep only the requested database codes ('/'-separated)
# that the species database reports as available.
input_dbs = set(opt.db.split('/'))
avail_dbs = {}
for database in speciesdb.databases_from_abbr(abbr):
    # Element 0 is the database code, element 1 the name printed below.
    avail_dbs[database[0]] = database[1]
dbs = input_dbs.intersection(set(avail_dbs.keys()))
if dbs:
    print '##Databases: %s' % ', '.join([avail_dbs[db] for db in dbs])
else:
    # None of the requested codes is available: report through the parser.
    opt_parser.error('No supported databases are selected. Supported databases are %s, but your input databases are: %s.' % \
                     ('/'.join(avail_dbs.keys()), opt.db))

odistr = discover.Distr()
for pathway in tuple(pathways): if pathway[0] not in pids: pathways.remove(pathway) pathways = tuple(pathways) gene_pathways = tuple(gene_pathways) return pathways, gene_pathways if __name__ == '__main__': import sys from pprint import pprint from kobas import config, dbutils pathways, gene_pathways = parse(open(sys.argv[1]), \ dbutils.KOBASDB(config.getrc()['kobasdb'] + 'hsa.db')) print len(pathways), len(gene_pathways) g, p = set(), set() for gp in gene_pathways: g.add(gp[0]) p.add(gp[1]) print len(g), len(p) pprint(pathways[:5]) pprint(gene_pathways[:5])
gids = bcids[gene_id] for gid in gids: gene_pathways.add((gid, pid)) pathways = list(pathways) gene_pathways = list(gene_pathways) return pathways, gene_pathways if __name__ == '__main__': import sys from pprint import pprint from kobas import config, dbutils pathways, gene_pathways = parse(open(sys.argv[1]), open(sys.argv[2]), \ dbutils.KOBASDB(config.getrc()['kobasdb'] + sys.argv[3] + '.db')) print len(pathways), len(gene_pathways) g, p = set(), set() for gp in gene_pathways: g.add(gp[0]) p.add(gp[1]) print len(g), len(p) pprint(pathways[:5]) pprint(gene_pathways[:5])
#!/usr/bin/env python
import os, time, math
import sys  # needed by verify_file(), which exits on a missing file
import numpy as np
#import rpy2.robjects as robjects
from itertools import combinations
from kobas import discover, annot, dbutils, config

# Path of the result file this script works on.
FILENAME = """/rd1/user/tangkj/trunk/test/result_I"""
kobasrc = config.getrc()
speciesdb = dbutils.KOBASDB(kobasrc['kobasdb'] + 'hsa' + '.db')
# Column names of one tab-separated result line, in file order.
COLUMN = ('Term', 'Database', 'ID', 'Input number', 'Background number',
          'P-Value', 'Corrected P-Value', 'Input', 'Hyperlink')
#robjects.r.library('epiR')


def verify_file(file_name):
    """Return an open handle for ``file_name``, or exit if it is missing.

    Only existence is checked (``os.F_OK``); when the path does not exist a
    message is printed and the process exits with status 1.
    """
    if not os.access(file_name, os.F_OK):
        print('file %s does not exist' % file_name)
        sys.exit(1)
    return open(file_name)


def oneTerm(line):
    """Turn one tab-separated result line into a ``{column: value}`` dict.

    Fields are paired with ``COLUMN`` positionally; surplus fields or
    surplus column names are dropped.  The line is not stripped, so a
    trailing newline stays in its last field.
    """
    cont = line.split('\t')
    term = dict(zip(COLUMN, cont))
    return term