Example #1
0
def insert_orthology(kobasrc, kobasdir):
    """Insert the ortholog pairs of every organism into its own database.

    The organism abbreviations are read from ``organism.db``; for each one
    the pairs produced by ``tsv.parse`` are written to the ``Orthologs``
    table of ``<abbr>.db``.

    Args:
        kobasrc: mapping whose 'kobasdb' entry is the path prefix of the
            database files.
        kobasdir: directory that contains the ``orthology`` sub-directory.
    """
    orthology_dir = kobasdir + '/orthology/'
    organism_path = kobasrc['kobasdb'] + 'organism.db'

    # To process only a few species, iterate over something like
    # [('tru', ), ('xma', )] instead of the full organism list.
    for row in dbutils.KOBASDB(organism_path).organisms(name=False):
        species = row[0]
        pairs = tsv.parse(kobasrc, orthology_dir, species)
        target = dbutils.KOBASDB(kobasrc['kobasdb'] + species + '.db')
        target.con.executemany('INSERT INTO Orthologs VALUES (?, ?)', pairs)
Example #2
0
def parse(stid_handle, map_handle, kobasrc):
    """Load Reactome pathways and gene-pathway links into the species databases.

    Args:
        stid_handle: iterable of tab-separated lines; the first three columns
            are read as UniProtKB accession, pathway identifier and pathway
            name.
        map_handle: iterable of newline-terminated lines with exactly five
            tab-separated columns: UniProtKB accession, an unused column, a
            '; '-separated list of pathway names, another unused column and
            the species name.
        kobasrc: mapping whose 'kobasdb' entry is the path prefix of the
            per-species database files.

    Mapping lines whose species name is not a key of
    ``reactome_organisms.reactome_organisms`` are ignored.  When this module
    is run as a script the rows are printed instead of being inserted.

    Returns:
        None.
    """
    pathways, gene_pathways = {}, {}
    # abbr -> KOBASDB; each species database is opened once instead of once
    # per input line.
    speciesdbs = {}

    stids = {}
    for line in stid_handle:
        uniprotkb_ac, paid, pname = line.split('\t')[:3]
        # The first identifier seen for an (accession, name) pair wins.
        stids.setdefault((uniprotkb_ac, pname), paid)

    for line in map_handle:
        uniprotkb_ac, tmp1, pnames, tmp2, sname = line[:-1].split('\t')

        # To restrict the run, test against a short list such as
        # ['Homo sapiens', 'Mus musculus'] instead.
        if sname not in reactome_organisms.reactome_organisms:
            continue
        abbr = reactome_organisms.reactome_organisms[sname]
        if abbr not in speciesdbs:
            speciesdbs[abbr] = dbutils.KOBASDB(
                kobasrc['kobasdb'] + abbr + '.db')
        speciesdb = speciesdbs[abbr]

        gids = speciesdb.gids_from_uniprotkb_ac(uniprotkb_ac)
        # Drop the leading "[...]: " prefix before splitting the name list.
        pnames = re.sub(r'\[.*\]: ', '', pnames).split('; ')
        for gid in gids:
            for pname in pnames:
                pname = re.sub(' IEA$', '', pname)
                paid = stids.get((uniprotkb_ac, pname), '')
                # [pathway id (assigned below), Reactome identifier]
                pathways.setdefault(abbr, {}).setdefault(pname, [0, paid])
                gene_pathways.setdefault(abbr, set()).add((gid[0], pname))

    for abbr in pathways:
        speciesdb = speciesdbs[abbr]

        # Pathway ids are count * 10 + 4; the trailing 4 presumably tags the
        # Reactome source -- TODO confirm against the other loaders.
        for count, pname in enumerate(pathways[abbr], 1):
            entry = pathways[abbr][pname]
            entry[0] = count * 10 + 4

            if __name__ == '__main__':
                print('%s R %s %s' % (entry[0], entry[1], pname))
            else:
                speciesdb.con.execute(
                    'INSERT INTO Pathways VALUES (?, ?, ?, ?)',
                    (entry[0], 'R', entry[1], pname))

        for gene_pathway in gene_pathways[abbr]:
            pid = pathways[abbr][gene_pathway[1]][0]
            if __name__ == '__main__':
                print('%s %s' % (gene_pathway[0], pid))
            else:
                speciesdb.con.execute(
                    'INSERT INTO GenePathways VALUES (?, ?)',
                    (gene_pathway[0], pid))
Example #3
0
def insert_go(kobasrc, kobasdir):
    """Fill the Gos and GeneGos tables of every species listed in go_organisms.

    The ontology is parsed from ``<kobasdir>/go/gene_ontology.1_2.obo``.  For
    each association whose GO id is a known term and not one of
    ``go_categories``, every gene mapped from the UniProtKB accession is
    linked to that term and to each of its inferred terms that passes the
    same filter.

    Args:
        kobasrc: mapping whose 'kobasdb' entry is the path prefix of the
            per-species database files.
        kobasdir: directory that contains the ``go`` sub-directory.
    """
    # Close the ontology file as soon as it has been parsed.
    with open(kobasdir + '/go/gene_ontology.1_2.obo') as obo_handle:
        go_terms, go_inferreds = obo.parse(obo_handle)

    def is_storable(goid):
        # Direct membership test; avoids building a key list per lookup.
        return goid not in go_categories and goid in go_terms

    # For a quick run the loop can be restricted to a few entries, e.g.
    # ('gene_association.goa_human', 'not_need', '9606', 'hsa') and
    # ('gene_association.mgi', 'gp2protein.mgi', '10090', 'mmu').
    for organism in go_organisms.go_organisms:
        multi_uniprotkb_ac_gos = association.parse(kobasdir, organism)

        # organism[3] holds one or more '|'-separated species abbreviations.
        for abbr in organism[3].split('|'):
            speciesdb = dbutils.KOBASDB(kobasrc['kobasdb'] + abbr + '.db')
            gos, gene_gos = set(), set()
            for uniprotkb_ac_go in multi_uniprotkb_ac_gos[abbr]:
                uniprotkb_ac = uniprotkb_ac_go[0]
                direct_goid = uniprotkb_ac_go[1]
                if not is_storable(direct_goid):
                    continue
                for gid in speciesdb.gids_from_uniprotkb_ac(uniprotkb_ac):
                    gene_gos.add((gid[0], direct_goid))
                    gos.add((direct_goid, go_terms[direct_goid]))
                    for goid in go_inferreds[direct_goid]:
                        if is_storable(goid):
                            gene_gos.add((gid[0], goid))
                            gos.add((goid, go_terms[goid]))

            speciesdb.con.executemany(
                'INSERT INTO Gos VALUES (?, ?)', list(gos))
            speciesdb.con.executemany(
                'INSERT INTO GeneGos VALUES (?, ?)', list(gene_gos))
Example #4
0
def parse(handle, kobasrc):
    """Load PANTHER pathways and gene-pathway links into the species databases.

    Args:
        handle: iterable of tab-separated lines; the first five columns are
            read as pathway identifier, pathway name, two unused columns and
            a '|'-separated gene descriptor whose first field is the species
            name.
        kobasrc: mapping whose 'kobasdb' entry is the path prefix of the
            per-species database files.

    Lines whose species name is not a key of
    ``panther_organisms.panther_organisms`` are ignored, as are genes that
    carry neither a ``UniProtKB=`` nor an ``ENTREZ=`` field.  When this module
    is run as a script the rows are printed instead of being inserted.

    Returns:
        None.
    """
    pathways, gene_pathways = {}, {}
    # abbr -> KOBASDB; each species database is opened once instead of once
    # per input line.
    speciesdbs = {}

    for line in handle:
        paid, pname, tmp1, tmp2, gene = line.split('\t')[:5]

        sname = gene.split('|')[0]
        # To restrict the run, test against a short list such as
        # ['HUMAN', 'MOUSE'] instead.
        if sname not in panther_organisms.panther_organisms:
            continue
        abbr = panther_organisms.panther_organisms[sname]
        if abbr not in speciesdbs:
            speciesdbs[abbr] = dbutils.KOBASDB(
                kobasrc['kobasdb'] + abbr + '.db')
        speciesdb = speciesdbs[abbr]

        # UniProtKB is preferred over ENTREZ when both fields are present.
        if 'UniProtKB' in gene:
            field = gene[gene.find('UniProtKB'):].split('|')[0]
            gids = speciesdb.gids_from_uniprotkb_ac(field.split('=')[1])
        elif 'ENTREZ' in gene:
            field = gene[gene.find('ENTREZ'):].split('|')[0]
            gids = speciesdb.gids_from_entrez_gene_id(field.split('=')[1])
        else:
            continue

        for gid in gids:
            # [pathway id (assigned below), PANTHER identifier]
            pathways.setdefault(abbr, {}).setdefault(pname, [0, paid])
            gene_pathways.setdefault(abbr, set()).add((gid[0], pname))

    for abbr in pathways:
        speciesdb = speciesdbs[abbr]

        # Pathway ids are count * 10 + 6; the trailing 6 presumably tags the
        # PANTHER source -- TODO confirm against the other loaders.
        for count, pname in enumerate(pathways[abbr], 1):
            entry = pathways[abbr][pname]
            entry[0] = count * 10 + 6

            if __name__ == '__main__':
                print('%s p %s %s' % (entry[0], entry[1], pname))
            else:
                speciesdb.con.execute(
                    'INSERT INTO Pathways VALUES (?, ?, ?, ?)',
                    (entry[0], 'p', entry[1], pname))

        for gene_pathway in gene_pathways[abbr]:
            pid = pathways[abbr][gene_pathway[1]][0]
            if __name__ == '__main__':
                print('%s %s' % (gene_pathway[0], pid))
            else:
                speciesdb.con.execute(
                    'INSERT INTO GenePathways VALUES (?, ?)',
                    (gene_pathway[0], pid))
Example #5
0
def parse(kobasrc, orthologydir, abbr):
    """Return the ortholog pairs of one species.

    Every gene of ``abbr`` is mapped to its Ensembl gene ids; each id found
    in the table returned by ``get_geid_oeids`` is followed to the listed
    (ortholog Ensembl id, ortholog species) entries, which are mapped back to
    gene ids in the ortholog species' database.

    Args:
        kobasrc: mapping whose 'kobasdb' entry is the path prefix of the
            per-species database files.
        orthologydir: directory handed to ``get_geid_oeids``.
        abbr: abbreviation of the species whose genes are looked up.

    Returns:
        A list of unique ``(gid, ortholog gid)`` tuples, in no particular
        order.
    """
    orthologs = set()

    speciesdb = dbutils.KOBASDB(kobasrc['kobasdb'] + abbr + '.db')
    # Materialised up front because further queries run inside the loop.
    gids = [gid[0] for gid in speciesdb.genes(name=False)]

    geid_oeids = get_geid_oeids(orthologydir, abbr)

    # ortholog species -> KOBASDB; opened once per species rather than once
    # per ortholog pair.
    ospeciesdbs = {}

    for gid in gids:
        for geid in speciesdb.ensembl_gene_ids_from_gid(gid):
            if geid[0] not in geid_oeids:
                continue
            for oeid, ospecies in geid_oeids[geid[0]]:
                if ospecies not in ospeciesdbs:
                    ospeciesdbs[ospecies] = dbutils.KOBASDB(
                        kobasrc['kobasdb'] + ospecies + '.db')
                oids = ospeciesdbs[ospecies].gids_from_ensembl_gene_id(oeid)
                for oid in oids:
                    orthologs.add((gid, oid[0]))

    return list(orthologs)
Example #6
0
def distr_from_default(default, db, speciesdb, abbr, small_num, idmapping):
    """Count, for every term, the background genes annotated with it.

    Args:
        default: species abbreviation; in the 'ko' branch it selects the
            genes of each KO and, with ``idmapping``, the database
            ``<kobasdb><default>.db`` used to look up linked identifiers.
        db: database code that becomes the second element of every key.  In
            the non-'ko' branch it must be a key of ``dbutils.P``,
            ``dbutils.D`` or ``dbutils.G``; otherwise nothing is counted.
        speciesdb: database object the terms and genes are queried from.
        abbr: ``'ko'`` selects the KO branch, anything else the per-gene
            branch.
        small_num: threshold handed to ``remove_small_terms2``.
        idmapping: when truthy, a gene contributes the number of rows
            returned by ``dblink_ids_from_gid(gid, idmapping)`` instead of 1.

    Returns:
        dict mapping ``(term name, db, term id)`` to its count, after
        ``remove_small_terms2`` has been applied to it.
    """
    distr = {}

    def add(term, id_field, weight):
        key = (term['name'], db, term[id_field])
        distr[key] = distr.get(key, 0) + weight

    if abbr == 'ko':
        # Opened lazily and only once; the original reopened it (and re-read
        # the configuration) for every gene.
        bspeciesdb = None
        for koid in speciesdb.kos():
            # NOTE(review): ``terms`` is iterated once per gene below.  If
            # pathways_from_koid returns a one-shot cursor, only the first
            # gene of each KO is counted -- confirm that it is re-iterable.
            terms = speciesdb.pathways_from_koid(koid['koid'])
            gids = speciesdb.gids_from_koid(koid['koid'], default)
            for gid in gids:
                weight = 1
                if idmapping:
                    if bspeciesdb is None:
                        bspeciesdb = dbutils.KOBASDB(
                            config.getrc()['kobasdb'] + default + '.db')
                    weight = len(list(
                        bspeciesdb.dblink_ids_from_gid(gid['gid'], idmapping)))
                for term in terms:
                    add(term, 'pid', weight)

    else:
        # The category of ``db`` does not change between genes.
        is_pathway = db in dbutils.P
        is_disease = db in dbutils.D
        is_go = db in dbutils.G
        if is_pathway or is_disease:
            id_field = 'id'
        elif is_go:
            id_field = 'goid'
        else:
            id_field = None

        for gid in speciesdb.genes():
            if is_pathway:
                terms = speciesdb.pathways_from_gid(gid['gid'], db)
            elif is_disease:
                terms = speciesdb.diseases_from_gid(gid['gid'], db)
            elif is_go:
                terms = speciesdb.gos_from_gid(gid['gid'])
            else:
                terms = ()

            weight = 1
            if idmapping:
                weight = len(list(
                    speciesdb.dblink_ids_from_gid(gid['gid'], idmapping)))

            if id_field is not None:
                for term in terms:
                    add(term, id_field, weight)

    remove_small_terms2(distr, small_num)

    return distr
Example #7
0
def _mark_hits(speciesdb, gset_gids, idtype, gene_index, row):
    """Flag in ``row`` the input genes linked to the members of one gene set.

    Every identifier returned by ``dblink_ids_from_gid`` for a member gene is
    tested, so a gene without linked identifiers matches nothing and a gene
    with several can match through any of them.

    Returns the matched identifiers in the order they were found.
    """
    hgene = []
    for gid in gset_gids:
        for each in speciesdb.dblink_ids_from_gid(gid[0], idtype):
            dblink_id = each[0]
            if dblink_id in gene_index:
                hgene.append(dblink_id)
                row[gene_index[dblink_id]] = 1
    return hgene


def read_gmt_db(gene_list, species, dbtype, idtype, min_size, max_size):
    """Build gene sets from a species database and match them to gene_list.

    Args:
        gene_list: sequence of gene identifiers (of type ``idtype``); its
            order defines the columns of the hit matrix.
        species: species abbreviation; ``<kobasdb><species>.db`` is opened.
        dbtype: ``'P:<db>/<db>...'``, ``'D:<db>/<db>...'`` or ``'G'``.  The
            part before ':' is the category, the part after it a
            '/'-separated list of database codes.
        idtype: identifier type passed to ``dblink_ids_from_gid``.
        min_size: gene sets with fewer matched genes are dropped.
        max_size: gene sets with more matched genes are dropped.

    Raises:
        exception.GmtdatabaseErr: unknown category, no usable database, no
            gene set found, or no input gene matched any gene set.
        exception.GmtvalueErr: every gene set was removed by the size filter.

    A ``ValueError`` raised inside the body ends the process through
    ``sys.exit``.

    NOTE(review): no ``return`` is visible in this part of the file; the
    ``*_filtered`` locals keep their original names (including the spelling
    ``hit_genes_filterd``) in case code after this point relies on them.
    """
    gset_name = []  # name of gene sets
    gset_des = []  # description of gene sets
    # Per gene set, the input genes that belong to it.
    hit_genes = []
    # GAD and N are not supported because they lack a did.
    D = {
        'o': 'OMIM',
        'k': 'KEGG DISEASE',
        'f': 'FunDO'
    }

    try:
        kobasrc = config.getrc()
        speciesdb = dbutils.KOBASDB(kobasrc['kobasdb'] + species + '.db')

        dbcatg = dbtype.split(':')[0]
        if dbcatg not in ('P', 'D', 'G'):
            raise exception.GmtdatabaseErr(
                "Error Message: This database category you input is not allowed. Please choose 'P','D','G' for database category.")

        records = []

        if dbcatg == 'P':
            input_dbs = set(dbtype.split(':')[1].split('/'))
            avail_dbs = {}
            for database in speciesdb.pathwaydbs_from_abbr(species):
                avail_dbs[database[0]] = database[1]
            dbs = input_dbs.intersection(avail_dbs)
            if dbs:
                print("Databases: %s" % ', '.join(
                    [avail_dbs[db] for db in dbs]))
            else:
                raise exception.GmtdatabaseErr(
                    "No supported databases are selected. Supported databases are %s, but your input databases are: %s." %
                    ('/'.join(avail_dbs), dbtype))

            for db in dbs:
                records += speciesdb.allpathways(db).fetchall()
            if not records:
                raise exception.GmtdatabaseErr(
                    "Error Message: \nFail to get information from gene set database.")

            # record: (pid, id, name); members come from table GenePathways
            name_col, des_col = 1, 2
            genes_of = speciesdb.genes_from_pid

        elif dbcatg == 'D':
            input_dbs = set(dbtype.split(':')[1].split('/'))
            if species != 'hsa':
                raise exception.GmtdatabaseErr(
                    "Error Message: Disease is only supported for Homo Sapiens(-s hsa), not supported for this species. Please choose another database category, e.g. 'P','G'")
            dbs = input_dbs.intersection(D)
            if not dbs:
                # List the codes that are actually supported (keys of D).
                raise exception.GmtdatabaseErr(
                    'No supported databases are selected. Supported databases are %s, but your input databases are: %s.' %
                    ('/'.join(sorted(D)), dbtype))

            for db in dbs:
                records += speciesdb.alldiseases(db).fetchall()
            if not records:
                raise exception.GmtdatabaseErr(
                    "Error Message: \nFail to get information from gene set database.")

            # Diseases without an id are dropped *before* the matrix is
            # sized, so rows, names and descriptions stay aligned.
            records = [record for record in records if record[1]]
            name_col, des_col = 1, 2
            genes_of = speciesdb.genes_from_did

        else:
            records = speciesdb.allgoterms().fetchall()
            # record: (goid, name)
            name_col, des_col = 0, 1
            genes_of = speciesdb.genes_from_goid

        # One row per kept record, one column per input gene.
        gset_num = len(records)
        gene_index = {}
        for j, gene in enumerate(gene_list):
            # First occurrence wins, as with list.index().
            gene_index.setdefault(gene, j)

        hit_matrix = np.zeros((gset_num, len(gene_list)))
        for i, record in enumerate(records):
            gset_name.append(record[name_col])  # id
            gset_des.append(record[des_col])  # name
            hit_genes.append(_mark_hits(
                speciesdb, genes_of(record[0]), idtype, gene_index,
                hit_matrix[i]))
        if not hit_matrix.any():
            raise exception.GmtdatabaseErr(
                "Error Message:\nNone of genes in input file(gct file) was matched with genes in gene sets. \nPlease check the gct file(-e), the species(-s), the idtype(-i), the database type(-d).")

        gset_name = np.array(tuple(gset_name))
        gset_des = np.array(gset_des)
        hit_genes = np.array(hit_genes)
        hitsum = hit_matrix.sum(1)
        # Rows whose number of matched genes is outside [min_size, max_size].
        delindex = np.where((hitsum < min_size) | (hitsum > max_size))[0]
        hit_matrix_filtered = np.delete(hit_matrix, delindex, axis=0)
        hit_genes_filterd = np.delete(hit_genes, delindex, axis=0)
        hit_sum_filtered = np.delete(hitsum, delindex, axis=0)
        gset_name_filtered = np.delete(np.array(gset_name), delindex, axis=0)
        gset_des_filtered = np.delete(gset_des, delindex, axis=0)
        if len(hit_matrix_filtered) == 0 and len(
                gset_name_filtered) == 0 and len(
                    gset_des_filtered) == 0 and len(hit_genes_filterd) == 0:
            raise exception.GmtvalueErr(
                "Error Message:\nAll %d gene sets have been filtered. \nPlease check the threshold and ceil of gene set size (values of min_size and max_size). "
                % len(hit_matrix))
    except ValueError as e:
        sys.exit(e)
Example #8
0
# Input-type key -> program name.
PROGRAMS = {
    'fasta:pro': 'blastp',
    'fasta:nuc': 'blastx',
}
# Identifier input-type key -> identifier name.
DBLINKS = {
    'id:ncbigene': 'entrez_gene_id',
    'id:ncbigi': 'gi',
    'id:uniprot': 'uniprotkb_ac',
    'id:ensembl': 'ensembl_gene_id',
}

opt_parser, opt, args = config_option()

# KOBAS environment configuration
kobasrc = config.getrc()

# Open the organism database and, when a species was given, its database.
organismdb = dbutils.KOBASDB(kobasrc['kobasdb'] + 'organism.db')
if opt.species:
    speciesdb = dbutils.KOBASDB(kobasrc['kobasdb'] + opt.species + '.db')

# Listing mode: print what is available, then stop.
if opt.list:
    if not opt.species:
        print('Available species: \nko\tKEGG Orthology')
        rows = organismdb.organisms(name=True)
    else:
        print('Available databases for %s:' % opt.species)
        rows = speciesdb.databases_from_abbr(opt.species)
    for row in rows:
        print('\t'.join(row))
    sys.exit()
Example #9
0
    kobasrc['blastdb'] = opt.kobas_home + '/seq_pep/'
if opt.blast_home:
    kobasrc['blast_home'] = opt.blast_home
    for _program in ('blastp', 'blastx'):
        kobasrc[_program] = opt.blast_home + '/' + _program + '/'

# Directory options given explicitly are applied last, so they overwrite the
# values set above.
for _key in ('blastdb', 'kobasdb', 'blastp', 'blastx'):
    _directory = getattr(opt, _key)
    if _directory:
        kobasrc[_key] = _directory + '/'

##open KOBASDB
speciesdb = dbutils.KOBASDB(kobasrc['kobasdb'] + abbr + '.db')

##process opt.db
# Keep only the requested databases that exist for this species.
input_dbs = set(opt.db.split('/'))
avail_dbs = {}
for database in speciesdb.databases_from_abbr(abbr):
    avail_dbs[database[0]] = database[1]
dbs = input_dbs & set(avail_dbs)
if not dbs:
    opt_parser.error(
        'No supported databases are selected. Supported databases are %s, but your input databases are: %s.'
        % ('/'.join(avail_dbs.keys()), opt.db))
else:
    print('##Databases: %s' % ', '.join([avail_dbs[db] for db in dbs]))

odistr = discover.Distr()
Example #10
0
    for pathway in tuple(pathways):
        if pathway[0] not in pids:
            pathways.remove(pathway)

    pathways = tuple(pathways)
    gene_pathways = tuple(gene_pathways)

    return pathways, gene_pathways


if __name__ == '__main__':
    # Manual check: parse the file named on the command line against the
    # 'hsa' database and print a short summary of the result.
    import sys
    from pprint import pprint

    from kobas import config, dbutils

    handle = open(sys.argv[1])
    hsadb = dbutils.KOBASDB(config.getrc()['kobasdb'] + 'hsa.db')
    pathways, gene_pathways = parse(handle, hsadb)

    print('%s %s' % (len(pathways), len(gene_pathways)))

    # Distinct first and second members of the gene-pathway pairs.
    g = set(gp[0] for gp in gene_pathways)
    p = set(gp[1] for gp in gene_pathways)

    print('%s %s' % (len(g), len(p)))

    pprint(pathways[:5])
    pprint(gene_pathways[:5])
Example #11
0
                    gids = bcids[gene_id]
                    for gid in gids:
                        gene_pathways.add((gid, pid))

    pathways = list(pathways)
    gene_pathways = list(gene_pathways)

    return pathways, gene_pathways


if __name__ == '__main__':
    # Manual check: parse the two files named on the command line against the
    # database of the species given as third argument and print a summary.
    import sys
    from pprint import pprint

    from kobas import config, dbutils

    first_handle = open(sys.argv[1])
    second_handle = open(sys.argv[2])
    speciesdb = dbutils.KOBASDB(
        config.getrc()['kobasdb'] + sys.argv[3] + '.db')
    pathways, gene_pathways = parse(first_handle, second_handle, speciesdb)

    print('%s %s' % (len(pathways), len(gene_pathways)))

    # Distinct first and second members of the gene-pathway pairs.
    g = set(gp[0] for gp in gene_pathways)
    p = set(gp[1] for gp in gene_pathways)

    print('%s %s' % (len(g), len(p)))

    pprint(pathways[:5])
    pprint(gene_pathways[:5])
Example #12
0
#!/usr/bin/env python
import os, time, math
import numpy as np
#import rpy2.robjects as robjects
from itertools import combinations
from kobas import discover, annot, dbutils, config

# Path of the result file this script works on.
FILENAME = '/rd1/user/tangkj/trunk/test/result_I'

# Configuration and the 'hsa' species database, opened at import time.
kobasrc = config.getrc()
speciesdb = dbutils.KOBASDB(kobasrc['kobasdb'] + 'hsa.db')

# Column names of one tab-separated result line, in order.
COLUMN = (
    'Term',
    'Database',
    'ID',
    'Input number',
    'Background number',
    'P-Value',
    'Corrected P-Value',
    'Input',
    'Hyperlink',
)
#robjects.r.library('epiR')


def verify_file(file_name):
    """Return an open read handle for file_name, or exit if it is missing.

    Existence is tested with ``os.access(file_name, os.F_OK)``.  When the
    test fails a message is printed and the process exits with status 1.

    Args:
        file_name: path of the file to open.

    Returns:
        The file object returned by ``open(file_name)``.
    """
    # ``sys`` is not among the module's visible imports, so the exit below
    # used to fail with NameError; import it where it is needed.
    import sys

    if not os.access(file_name, os.F_OK):
        print('file %s does not exist' % file_name)
        sys.exit(1)
    return open(file_name)


def oneTerm(line):
    """Turn one tab-separated result line into a dict keyed by COLUMN.

    Fields are paired with the names in ``COLUMN`` by position; pairing stops
    at the shorter of the two sequences.
    """
    return dict(zip(COLUMN, line.split('\t')))