Example #1
def locus2ko_table_legacy(biodatabase, hash2ko, hash2locus_list):
    # create legacy table locus2ko
    # TODO: remove all dependencies to this table
    # content:
    '''
    taxon_id, locus_tag, orthogroup, ko_id (not numerical id but KO*)
    '''

    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodatabase)

    sql = 'select locus_tag, taxon_id from orthology_detail_%s' % biodatabase
    sql2 = 'select locus_tag, orthogroup from orthology_detail_%s' % biodatabase

    locus2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql))
    locus2orthogroup = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql2))

    sql2 = 'CREATE TABLE IF NOT EXISTS enzyme.locus2ko_%s (taxon_id INT,'\
           ' locus_tag VARCHAR(200),' \
           ' orthogroup varchar(200),' \
           ' ko_id VARCHAR(200), index taxon_id (taxon_id), index ko_id (ko_id));' % (biodatabase)

    server.adaptor.execute_and_fetchall(sql2, )
    for hash in hash2ko:
        for locus in hash2locus_list[hash]:
            ko = hash2ko[hash]["KO"]

            sql = 'insert into enzyme.locus2ko_%s (taxon_id, locus_tag, orthogroup, ko_id) values ("%s", "%s", "%s", "%s")' % (
                biodatabase, locus2taxon_id[locus], locus,
                locus2orthogroup[locus], ko)

            server.adaptor.execute(sql, )
        server.commit()
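A minimal sketch of the inputs this legacy loader expects, inferred from the loop above (hash, KO and locus values are hypothetical):

# hash2ko maps a protein hash to a dict holding the KO accession under the "KO" key;
# hash2locus_list maps the same hash to the locus tags sharing that sequence.
hash2ko = {"CRC-02125D79C6DFBC1D": {"KO": "K03088"}}
hash2locus_list = {"CRC-02125D79C6DFBC1D": ["RhT_01099", "RhT_01100"]}
# locus2ko_table_legacy("my_biodb", hash2ko, hash2locus_list)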
Example #2
def blastswiss2biosql(locus_tag2seqfeature_id, db_name, n_procs, mysql_host,
                      mysql_user, mysql_pwd, mysql_db, *input_blast_files):

    import sys
    import numpy
    from multiprocessing import Process
    from chlamdb.biosqldb import manipulate_biosqldb

    create_sql_blast_swissprot_tables(db_name, mysql_host, mysql_user,
                                      mysql_pwd, mysql_db)

    # 'server' is used below but was never created in the original snippet
    server, db = manipulate_biosqldb.load_db(db_name)

    # load blast data into blastnr_ db_name table
    n_cpu = n_procs
    n_poc_per_list = int(numpy.ceil(len(input_blast_files) / float(n_cpu)))
    query_lists = _chunks(input_blast_files, n_poc_per_list)
    print('n lists: %s' % len(query_lists))



    sql = 'create table if not exists blastnr.gi2taxon_and_description (gi INT, taxon_id INT, description TEXT,' \
          ' taxonomy TEXT, source TEXT);'
    server.adaptor.execute(sql, )

    print('get locus2taxon_id')
    sql = 'select locus_tag, taxon_id from orthology_detail_%s' % db_name
    locus_tag2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))
    print('get locus2bioentry')
    sql2 = 'select locus_tag,bioentry_id from biodatabase t1 ' \
           ' inner join bioentry as t2 on t1.biodatabase_id=t2.biodatabase_id' \
           ' inner join orthology_detail_%s t3 on t2.accession=t3.accession where t1.name="%s"' % (db_name,db_name)

    locus_tag2bioentry_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql2, ))

    if len(input_blast_files) > n_poc_per_list:

        procs = []
        for one_list in query_lists:
            proc = Process(target=load_blastswissprot_file_into_db,
                           args=(locus_tag2taxon_id, locus_tag2seqfeature_id,
                                 locus_tag2bioentry_id, mysql_host, mysql_user,
                                 mysql_pwd, mysql_db, one_list, db_name))
            procs.append(proc)
            proc.start()

        # Wait for all worker processes to finish
        for proc in procs:
            proc.join()
    else:
        print(
            'Only %s input blast file(s) for %s CPUs, not running in parallel mode'
            % (len(input_blast_files), n_cpu))
        load_blastswissprot_file_into_db(locus_tag2taxon_id,
                                         locus_tag2seqfeature_id,
                                         locus_tag2bioentry_id, mysql_host,
                                         mysql_user, mysql_pwd, mysql_db,
                                         input_blast_files, db_name)

    sys.stdout.write("done!")
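The `_chunks` helper used above is not shown in this snippet; a minimal sketch of the assumed behaviour (splitting the input file list into sub-lists of at most n items, returned as a list so that len() works):

def _chunks(items, n):
    # assumed behaviour: successive sub-lists of length <= n
    return [list(items[i:i + n]) for i in range(0, len(items), n)]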
Example #3
def load_PMID(PMID_db_path, hash2locus_list, db_name):

    from chlamdb.biosqldb import manipulate_biosqldb
    from chlamdb.biosqldb import biosql_own_sql_tables
    import sqlite3

    sqlite_conn = sqlite3.connect(PMID_db_path)
    sqlite_cursor = sqlite_conn.cursor()

    server, db = manipulate_biosqldb.load_db(db_name)

    sql1 = 'create table string.seqfeature_id2pmid_%s (seqfeature_id INTEGER, pmid INTEGER);' % db_name
    sql2 = 'create table if not exists string.pmid2data (pmid INTEGER, title TEXT, authors TEXT, source TEXT, abstract TEXT);'

    server.adaptor.execute(sql1, )
    server.adaptor.execute(sql2, )

    # get locus_tag2seqfeature_id dictionary
    sql = 'select locus_tag, seqfeature_id from annotation.seqfeature_id2locus_%s' % db_name
    locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))
    # retrieve existing pmid data (if any)
    sql = 'select pmid from string.pmid2data '
    pmid_already_in_db = [
        str(i[0]) for i in server.adaptor.execute_and_fetchall(sql, )
    ]

    sql = 'select * from hash2pmid'
    hash_and_pmid = sqlite_cursor.execute(sql, ).fetchall()

    sql = 'select * from pmid2data'
    pmid2data = manipulate_biosqldb.to_dict(
        sqlite_cursor.execute(sql, ).fetchall())

    template1 = 'insert into string.pmid2data values (%s, %s, %s, %s, %s)'
    template2 = 'insert into string.seqfeature_id2pmid_%s' % db_name + ' values(%s, %s)'
    for n, row in enumerate(hash_and_pmid):
        hash, pmid = row
        locus_list = hash2locus_list[hash]
        # pmid_already_in_db holds pmids as strings: compare and store them consistently
        if str(pmid) not in pmid_already_in_db:
            try:
                server.adaptor.execute(template1,
                                       [pmid] + list(pmid2data[str(pmid)]))
                pmid_already_in_db.append(str(pmid))
            except KeyError:
                print("Problem with pmid %s" % pmid)
        for locus_tag in locus_list:
            server.adaptor.execute(template2,
                                   [locus_tag2seqfeature_id[locus_tag], pmid])
    server.commit()

    sql1 = 'create index sf on string.seqfeature_id2pmid_%s(seqfeature_id)' % db_name
    sql2 = 'create index pm1 on string.seqfeature_id2pmid_%s(pmid)' % db_name
    sql3 = 'create index pm2 on string.pmid2data (pmid)'

    server.adaptor.execute(sql1, )
    server.adaptor.execute(sql2, )
    server.adaptor.execute(sql3, )
Example #4
def get_species_data(server,
                     biodb):
    
    """
    For each species, report:

    - number of complete genomes (1 contig)
    - number of draft genomes (>1 contig)
    - completeness range
    - ideally distinguish metagenomes from cultured representatives
    """

    from chlamdb.biosqldb import manipulate_biosqldb

    sql1 = f'''
            select t3.species,count(*) as n_complete from biosqldb.bioentry t1
            inner join biosqldb.taxid2species_{biodb} t2 on t1.taxon_id=t2.taxon_id
            inner join biosqldb.species_curated_taxonomy_{biodb} t3 on t2.species_id=t3.species_id
            inner join biosqldb.genomes_info_{biodb} t4 on t1.accession=t4.ACCESSION where t4.n_contigs=1 and t1.description not like "%%%%plasmid%%%%" group by species;
            '''
    sql2 = f'''
            select t3.species,count(*) as n_incomplete from biosqldb.bioentry t1
            inner join biosqldb.taxid2species_{biodb} t2 on t1.taxon_id=t2.taxon_id
            inner join biosqldb.species_curated_taxonomy_{biodb} t3 on t2.species_id=t3.species_id
            inner join biosqldb.genomes_info_{biodb} t4 on t1.accession=t4.ACCESSION where t4.n_contigs>1 and t1.description not like "%%%%plasmid%%%%" group by species;
            '''

    sql3 = f'''
            select t3.species, t5.completeness,t4.n_contigs from biosqldb.bioentry t1
            inner join biosqldb.taxid2species_{biodb} t2 on t1.taxon_id=t2.taxon_id
            inner join biosqldb.species_curated_taxonomy_{biodb} t3 on t2.species_id=t3.species_id
            inner join biosqldb.genomes_info_{biodb} t4 on t1.accession=t4.ACCESSION
            inner join custom_tables.checkm_{biodb} t5 on t1.taxon_id=t5.taxon_id where t1.description not like "%%%%plasmid%%%%";
            '''

    species2n_complete_genomes = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql1,))
    species2n_draft_genomes = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql2,))
    
    species2completeness = {}
    
    for row in server.adaptor.execute_and_fetchall(sql3,):
        species, completeness, n_contigs = row 
        completeness = float(completeness)
        if int(n_contigs) == 1:
            continue
        if species not in species2completeness:
            species2completeness[species] = [completeness, completeness]
        else:
            if completeness < species2completeness[species][0]:
                print("smaller!")
                species2completeness[species][0] = completeness
            if completeness > species2completeness[species][1]:
                print("Larger")
                species2completeness[species][1] = completeness
    print(species2completeness)
    return species2n_complete_genomes, species2n_draft_genomes, species2completeness
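A minimal usage sketch of the three dictionaries returned above (database name and values hypothetical):

# server, _ = manipulate_biosqldb.load_db("my_biodb")
# complete, drafts, completeness = get_species_data(server, "my_biodb")
# complete     -> {"Chlamydia trachomatis": 5, ...}         # species -> n complete genomes (1 contig)
# drafts       -> {"Chlamydia trachomatis": 12, ...}        # species -> n draft genomes (>1 contig)
# completeness -> {"Chlamydia trachomatis": [91.2, 99.8]}   # species -> [min, max] completeness of drafts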
Example #5
def biodb2pairwise_dist_phylogenies(biodb):

    from ete2 import Tree
    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select phylogeny from biosqldb_phylogenies.%s' % biodb
    all_phylogenies = [
        i[0] for i in server.adaptor.execute_and_fetchall(sql, )
    ]

    sql = 'select seqfeature_id, taxon_id from custom_tables.locus2seqfeature_id_%s' % biodb
    seqfeature_id2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb
    locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'create table if not exists biosqldb_phylogenies.pairwise_leaf_dist_%s (taxon_1 INT, ' \
          ' taxon_2 INT, ' \
          ' seqfeature_1 INT, ' \
          ' seqfeature_2 INT, ' \
          ' distance FLOAT, ' \
          ' index taxon_1(taxon_1), ' \
          ' index taxon_2(taxon_2), ' \
          ' index seqfeature_1(seqfeature_1),' \
          ' index seqfeature_2(seqfeature_2), ' \
          ' index distance(distance))' % (biodb)

    server.adaptor.execute(sql, )
    for c, phylogeny in enumerate(all_phylogenies):

        t = Tree(phylogeny)
        leaf_list = [i for i in t.iter_leaves()]
        print('%s / %s -- %s' % (c, len(all_phylogenies), len(leaf_list)))
        for n, l1 in enumerate(leaf_list):
            #print "%s / %s" % (n, len(leaf_list))
            for l2 in leaf_list[n + 1:]:
                seqid_1 = locus_tag2seqfeature_id[str(l1.name)]
                seqid_2 = locus_tag2seqfeature_id[str(l2.name)]
                taxon_1 = seqfeature_id2taxon_id[str(seqid_1)]
                taxon_2 = seqfeature_id2taxon_id[str(seqid_2)]

                sql = 'insert into biosqldb_phylogenies.pairwise_leaf_dist_%s values (%s, %s, %s, %s, %s)' % (
                    biodb, taxon_1, taxon_2, seqid_1, seqid_2,
                    l1.get_distance(l2))
                server.adaptor.execute(sql, )
        server.commit()
Example #6
def load_locus2cog_into_sqldb_legacy(input_blast_files, biodb, cdd_id2cog_id,
                                     hash2locus_tag_list):
    import MySQLdb
    import os
    from chlamdb.biosqldb import manipulate_biosqldb

    mysql_host = 'localhost'
    mysql_user = '******'
    mysql_pwd = os.environ['SQLPSW']
    mysql_db = 'COG'
    conn = MySQLdb.connect(host=mysql_host,
                           user=mysql_user,
                           passwd=mysql_pwd,
                           db=mysql_db)
    cursor = conn.cursor()

    sql = 'create table COG.locus_tag2gi_hit_%s (accession varchar(100), locus_tag varchar(100), gi INT, COG_id varchar(100),' \
          'index locus_tag (locus_tag), index accession (accession))' % biodb

    cursor.execute(sql)
    conn.commit()
    sql = 'select locus_tag, accession from orthology_detail_%s' % biodb
    sql3 = 'select protein_id, COG_id from COG.cog_2014;'
    sql2 = 'select protein_id, locus_tag from orthology_detail_%s' % biodb
    sql5 = 'select locus_tag,length(translation) from orthology_detail_%s;' % biodb

    server, db = manipulate_biosqldb.load_db(biodb)
    locus2genome_accession = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql))
    protein_id2locus_tag = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql2))
    protein_id2COG = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql3))
    locus_tag2protein_length = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql5))

    for input_blast in input_blast_files:

        # NOTE: cog_id2length (COG id -> profile length) is used below but never built
        # in this snippet; it is assumed to be available from the enclosing module.
        locus2data = blast2COG(input_blast, hash2locus_tag_list, cdd_id2cog_id,
                               cog_id2length, locus_tag2protein_length)

        for locus in locus2data:
            sql = 'INSERT into locus_tag2gi_hit_%s (accession, locus_tag, gi, COG_id) VALUES ("%s", "%s", %s, "%s")' % (
                biodb,
                locus2genome_accession[locus],
                locus,
                0,  # no gi anymore
                locus2data[locus]["cog_id"])
            cursor.execute(sql)
    conn.commit()
Example #7
def collect_Pfam_accession(db_name):
    '''

    collect presence/absence of Pfam domains for each genome/plasmid accession

    :param db_name:
    :return:
    '''

    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(db_name)

    sql_head = 'INSERT INTO comparative_tables.Pfam_accessions_%s (id,' % db_name

    accession_list = get_all_accessions(db_name)
    sql_head += ','.join(accession_list) + ') values ('

    all_pfam_ids_sql = 'select signature_accession from interpro_%s where analysis="Pfam" ' \
                       'group by signature_accession;' % db_name

    all_pfam_ids = [
        i[0] for i in server.adaptor.execute_and_fetchall(all_pfam_ids_sql, )
    ]

    sql = 'select t1.accession, t1.description from bioentry t1 inner join biodatabase t2 on t1.biodatabase_id=t2.biodatabase_id' \
          ' where t2.name="%s"' % db_name
    accession2organism = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    for i, pfam_accession in enumerate(all_pfam_ids):
        print(i, '/', len(all_pfam_ids), pfam_accession)
        sql = 'select organism, count(*) from biosqldb.interpro_%s ' \
              ' where analysis="Pfam" and signature_accession="%s" group by organism;' % (db_name, pfam_accession)

        data = manipulate_biosqldb.to_dict(
            server.adaptor.execute_and_fetchall(sql, ))

        sql_temp = sql_head + '"%s",' % pfam_accession

        # use a distinct name here: reusing 'accession' would shadow the Pfam accession above
        for genome_accession in accession_list:
            try:
                sql_temp += '%s,' % data[accession2organism[str(genome_accession)]]
            except KeyError:
                sql_temp += '0,'
        sql_temp = sql_temp[0:-1] + ');'
        server.adaptor.execute(sql_temp, )
        server.adaptor.commit()
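For reference, the statement assembled piece by piece above ends up looking roughly like this, with one count column per genome/plasmid accession (names and counts hypothetical):

# INSERT INTO comparative_tables.Pfam_accessions_my_biodb (id,NC_000117,NC_015713)
#     values ("PF00005",3,0);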
Example #8
def best_blast_hit_majority_species(biodb):
    '''
    1. for each protein, get the species of the best hit and calculate the proportion
       of the proteome it represents

    :param biodb:
    :return:
    '''

    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select taxon_id from biodatabase t1 inner join bioentry t2 on t1.biodatabase_id=t2.biodatabase_id' \
          ' where t1.name="%s" group by taxon_id' % biodb

    taxon_list = [i[0] for i in server.adaptor.execute_and_fetchall(sql, )]

    sql = 'CREATE table blastnr.blastnr_best_hits_species_%s (taxon_id INT, hit_taxid INT, count INT, proportion FLOAT)' % biodb

    server.adaptor.execute(sql, )
    server.commit()

    sql = 'select taxon_id, count(*) as n from biosqldb.orthology_detail_%s group by taxon_id' % (
        biodb)
    taxon_id2n_CDS = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    for one_taxon in taxon_list:
        print(one_taxon)

        sql2 = 'select A.subject_taxid, count(*) from blastnr.blastnr_%s A ' \
               ' where A.query_taxon_id=%s and hit_number=1 group by A.subject_taxid;' % (biodb, one_taxon)

        taxon_id2count = manipulate_biosqldb.to_dict(
            server.adaptor.execute_and_fetchall(sql2, ))
        #print seqfeature2best_hit_phylum
        total_bbh = float(sum([int(i) for i in taxon_id2count.values()]))

        for n, taxon in enumerate(taxon_id2count):
            sql = 'insert into blastnr.blastnr_best_hits_species_%s values(%s,%s,%s, %s)' % (
                biodb, one_taxon, taxon, taxon_id2count[taxon],
                round((taxon_id2count[taxon] / total_bbh) * 100, 2))

            server.adaptor.execute(sql, )

        server.commit()
Example #9
def get_interpro2go_table():
    '''

    :param interpro_release: for example 60.0
    :return:
    '''
    import urllib2
    import MySQLdb
    from chlamdb.biosqldb import manipulate_biosqldb
    import os
    sqlpsw = os.environ['SQLPSW']
    conn = MySQLdb.connect(
        host="localhost",  # your host, usually localhost
        user="******",  # your username
        passwd=sqlpsw,  # your password
        db="interpro")  # name of the data base
    cursor = conn.cursor()

    sql = 'CREATE table interpro2gene_ontology (interpro_id INT, go_id INT, index interpro_id(interpro_id), index go_id(go_id))'
    #cursor.execute(sql,)
    #conn.commit()

    sql = 'select name,interpro_id from entry'
    cursor.execute(sql, )
    interpro_name2interpro_id = manipulate_biosqldb.to_dict(cursor.fetchall())

    sql = 'select acc,id from gene_ontology.term where acc like "GO%"'
    cursor.execute(sql, )
    go_name2go_id = manipulate_biosqldb.to_dict(cursor.fetchall())

    link = 'ftp://ftp.ebi.ac.uk/pub/databases/interpro/interpro2go'

    req = urllib2.Request(link)
    entry_list = urllib2.urlopen(req)
    echec = 0
    for i, line in enumerate(entry_list):
        if line[0] == '!':
            continue
        interpro_name = line.rstrip().split(':')[1].split(' ')[0]
        go_name = line.rstrip().split(' ; ')[1]
        try:
            sql = 'insert into interpro2gene_ontology values(%s, %s)' % (
                interpro_name2interpro_id[interpro_name],
                go_name2go_id[go_name])
            cursor.execute(sql, )
            conn.commit()
        except KeyError:
            echec += 1
    print(echec, i)
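The parser above assumes interpro2go mapping lines of roughly the following shape (comment lines start with '!'); the first split recovers the InterPro accession, the ' ; ' split recovers the GO accession:

# !date: 2016/09/09
# InterPro:IPR000003 Retinoid X receptor/HNF4 > GO:DNA binding ; GO:0003677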
Example #10
def seqfeature_id2n_species_chlamydiae_only(biodb, chlamydiae_taxon_list):
    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    sql0 = 'create table if not exists custom_tables.seqfeature_id2n_species_chlamydiae_%s (seqfeature_id INT primary KEY, n_species INT, INDEX n_species(n_species))' % biodb

    server.adaptor.execute(sql0, )

    sql1 = 'select locus_tag, orthogroup from orthology_detail_%s' % biodb

    orthogroup2locus_list = {}
    for row in server.adaptor.execute_and_fetchall(sql1, ):
        if row[1] not in orthogroup2locus_list:
            orthogroup2locus_list[row[1]] = [row[0]]
        else:
            orthogroup2locus_list[row[1]].append(row[0])

    sql2 = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb
    locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql2, ))

    sql3 = 'select taxon_id, species_id from species_%s' % biodb

    taxon_id2species_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql3, ))

    sql4 = 'select locus_tag, taxon_id from orthology_detail_%s' % biodb

    locus2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql4, ))

    for group in orthogroup2locus_list:
        species = []
        locus_keep = []
        for locus in orthogroup2locus_list[group]:
            taxon_id = locus2taxon_id[locus]
            if taxon_id in chlamydiae_taxon_list:
                locus_keep.append(locus)
                species_id = taxon_id2species_id[str(taxon_id)]
                if species_id not in species:
                    species.append(species_id)
        for one_locus in locus_keep:
            seqfeature_id = locus_tag2seqfeature_id[one_locus]
            sql = 'insert into custom_tables.seqfeature_id2n_species_chlamydiae_%s values (%s, %s)' % (
                biodb, seqfeature_id, len(species))
            server.adaptor.execute(sql, )
        server.commit()
Example #11
def load_hmm_data(biodb, database_name, table_name, hmm_tab_files):

    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'create table %s.%s (locus_tag varchar(400), taxon_id INT, hmm_accession varchar(400));' % (database_name,
                                                                                                      table_name)

    server.adaptor.execute_and_fetchall(sql,)

    sql = 'select locus_tag, taxon_id from biosqldb.orthology_detail_%s' % biodb

    locus2taxon_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))

    for hmm_file in hmm_tab_files:
        print(hmm_file)
        with open(hmm_file, 'r') as f:
            for row in f:
                if row[0] == '#':
                    continue
                else:
                    data = row.rstrip().split()
                    locus_tag = data[0]
                    hit_accession = data[3]
                    print('%s\t%s\t%s' % (hit_accession, locus_tag, locus2taxon_id[locus_tag]))
                    sql = 'insert into %s.%s values ("%s", %s, "%s")' % (database_name, table_name,locus_tag, locus2taxon_id[locus_tag], hit_accession)
                    server.adaptor.execute(sql,)
        server.adaptor.commit()
Example #12
def get_whole_db_interpro_taxonomy():
    import MySQLdb
    from chlamdb.biosqldb import manipulate_biosqldb
    import os
    sqlpsw = os.environ['SQLPSW']
    conn = MySQLdb.connect(
        host="localhost",  # your host, usually localhost
        user="******",  # your username
        passwd=sqlpsw,  # your password
        db="interpro")  # name of the data base
    cursor = conn.cursor()

    interpro_version = get_interpro_version()

    create_interpro_taxonomy_table(interpro_version)

    sql = 'select name,interpro_id from interpro.entry'
    cursor.execute(sql, )
    interpro_accession2interpro_id = manipulate_biosqldb.to_dict(
        cursor.fetchall())
    for i, interpro_accession in enumerate(interpro_accession2interpro_id):
        print('%s / %s -- %s' % (i, len(interpro_accession2interpro_id),
                                 interpro_accession))
        euk, bact, arch, virus = interpro_entry2taxonomy(interpro_accession)

        total = float(euk + bact + arch + virus)
        sql = 'insert into interpro_taxonomy_v_%s values (%s,%s,%s,%s,%s,%s,%s,%s, %s, %s)' % (
            int(interpro_version),
            interpro_accession2interpro_id[interpro_accession], euk, bact,
            arch, virus, total, round((euk / total) * 100, 2),
            round((bact / total) * 100, 2), round(
                (arch / total) * 100, 2), round((virus / total) * 100, 2))

        cursor.execute(sql, )
        conn.commit()
Example #13
def load_sport_data(psortdb_output_files,
                    db_name,
                    hash2locus_list):

    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(db_name)

    hash2psort_results = {}
    for psortdb_output in psortdb_output_files:
        hash2psort_results.update(parse_psort_results(psortdb_output))
    
    
    sql = f'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_{db_name}'
    locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))
    
    sql = f'create table if not exists custom_tables.seqfeature_id2psortb_{db_name} (seqfeature_id INT, final_prediction varchar(200), score FLOAT)'
    server.adaptor.execute(sql,)
    
    sql_template = f'insert into custom_tables.seqfeature_id2psortb_{db_name} values (%s, %s, %s)'
    
    for hash in hash2psort_results:
        for locus in hash2locus_list[hash]:
            final_pred = hash2psort_results[hash]['final_prediction']
            if final_pred == 'Unknown':
                score = None
            else:
                score = hash2psort_results[hash][final_pred]
            server.adaptor.execute(sql_template, (locus_tag2seqfeature_id[locus],
                                                  final_pred,
                                                  score))
    
    sql = f'create index sfp on custom_tables.seqfeature_id2psortb_{db_name}(seqfeature_id)'                 
    server.adaptor.execute(sql,)
    server.commit()
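A minimal sketch of the per-protein dictionary that `parse_psort_results` is assumed to return, inferred from the loop above (hashes and scores hypothetical):

# hash2psort_results = {
#     "CRC-02125D79C6DFBC1D": {"final_prediction": "Cytoplasmic", "Cytoplasmic": 9.97},
#     "CRC-20ACA5744563D026": {"final_prediction": "Unknown"},  # no score stored for 'Unknown'
# }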
Example #14
def locus2ec_table(hash2ec_dico, biodatabase, hash2locus_list):

    from chlamdb.biosqldb import manipulate_biosqldb
    from chlamdb.biosqldb import biosql_own_sql_tables

    server, db = manipulate_biosqldb.load_db(biodatabase)

    sql2 = 'CREATE TABLE IF NOT EXISTS enzyme.seqfeature_id2ec_%s (enzyme_id INT AUTO_INCREMENT PRIMARY KEY,' \
           ' seqfeature_id INT,' \
           ' ec_id INT,' \
           ' FOREIGN KEY(ec_id) REFERENCES enzymes(enzyme_id));' % (biodatabase)

    server.adaptor.execute_and_fetchall(sql2, )

    sql = 'select locus_tag, seqfeature_id from annotation.seqfeature_id2locus_%s' % biodatabase

    locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    for hash in hash2ec_dico:
        for ec_data in hash2ec_dico[hash]:
            for locus in hash2locus_list[hash]:
                sql = 'select enzyme_id from enzyme.enzymes where ec="%s"' % ec_data[
                    0]
                ec_id = server.adaptor.execute_and_fetchall(sql, )[0][0]
                seqfeature_id = locus_tag2seqfeature_id[locus]
                sql = 'insert into enzyme.seqfeature_id2ec_%s (seqfeature_id, ec_id) values (%s, %s)' % (
                    biodatabase, seqfeature_id, ec_id)
                server.adaptor.execute(sql, )
    server.commit()
Example #15
def locus2ko_table(locus_tag2ko_dico, biodatabase, ko_accession2ko_id):

    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodatabase)

    sql2 = 'select locus_tag, seqfeature_id from annotation.seqfeature_id2locus_%s' % biodatabase

    locus2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql2))

    sql2 = 'CREATE TABLE IF NOT EXISTS enzyme.seqfeature_id2ko_%s (seqfeature_id INT,' \
           ' ko_id INT, ' \
           ' index ko_id (ko_id),' \
           ' index seqid (seqfeature_id));' % (biodatabase)

    server.adaptor.execute_and_fetchall(sql2, )

    for locus in locus_tag2ko_dico:
        ko = locus_tag2ko_dico[locus]
        ko_id = ko_accession2ko_id[ko]
        seqfeature_id = locus2seqfeature_id[locus]

        sql = 'insert into enzyme.seqfeature_id2ko_%s (seqfeature_id, ko_id) values (%s, %s)' % (
            biodatabase, seqfeature_id, ko_id)

        server.adaptor.execute(sql, )
    server.commit()
Example #16
def load_hash2locus(hash2locus_list, biodb):
    import MySQLdb
    import os
    from chlamdb.biosqldb import manipulate_biosqldb
    sqlpsw = os.environ['SQLPSW']
    conn = MySQLdb.connect(
        host="localhost",  # your host, usually localhost
        user="******",  # your username
        passwd=sqlpsw)  # your password
    cursor = conn.cursor()

    sql = 'select locus_tag, seqfeature_id from annotation.seqfeature_id2locus_%s' % biodb
    cursor.execute(sql, )
    locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(cursor.fetchall())

    sql = 'create table annotation.hash2seqfeature_id_%s (hash varchar(300), seqfeature_id INTEGER)' % biodb
    cursor.execute(sql, )

    for hash in hash2locus_list:
        locus_list = hash2locus_list[hash]
        for locus in locus_list:
            cursor.execute(
                'insert into annotation.hash2seqfeature_id_%s values ("%s", "%s")'
                % (biodb, hash, locus_tag2seqfeature_id[locus]))
    conn.commit()
    sql1 = 'create index h1 ON annotation.hash2seqfeature_id_%s(hash)' % biodb
    sql2 = 'create index h2 ON annotation.hash2seqfeature_id_%s(seqfeature_id)' % biodb
    cursor.execute(sql1, )
    cursor.execute(sql2, )
    conn.commit()
Example #17
def get_pairwise_connexions(accession_1, accession_2, biodb):
    from chlamdb.biosqldb import manipulate_biosqldb
    import numpy
    import pandas

    server, db = manipulate_biosqldb.load_db(biodb)

    sql1 = 'select seqfeature_id, start, stop from biosqldb.orthology_detail_%s where accession in ("%s","%s") ' % (biodb,
                                                                                                                   accession_1,
                                                                                                                   accession_2)
    seqfeature_id2location = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql1,))

    print(list(seqfeature_id2location.keys())[0:10])

    sql2 = 'select accession, taxon_id from biodatabase t1 inner join bioentry t2 on t1.biodatabase_id=t2.biodatabase_id' \
           ' where t1.name="%s"' % biodb
    print(sql2)


    accession2taxon_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql2,))

    comp1_sql = 'select locus_1,locus_2,identity from (select * from ' \
                ' comparative_tables.identity_closest_homolog2_%s where taxon_1=%s and taxon_2=%s) A ' \
                ' inner join biosqldb.orthology_detail_%s B on A.locus_1=B.seqfeature_id;' % (biodb,
                                                                                              accession2taxon_id[accession_1],
                                                                                              accession2taxon_id[accession_2],
                                                                                              biodb)


    data = server.adaptor.execute_and_fetchall(comp1_sql,)

    comparison_table = []
    for row in data:
        print(row)
        try:
            start1 = seqfeature_id2location[int(row[0])][0]
            stop1 = seqfeature_id2location[int(row[0])][1]
            start2 = seqfeature_id2location[int(row[1])][0]
            stop2 = seqfeature_id2location[int(row[1])][1]
            identity = row[2]
            comparison_table.append([start1, stop1, start2, stop2, identity])
        except KeyError:
            # one of the two seqfeatures has no recorded location on these accessions
            pass
    data = numpy.array(comparison_table)
    columns = ['start1', 'end1', 'start2', 'end2', 'identity']
    df = pandas.DataFrame(data, columns=columns)
    return df
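A minimal usage sketch (accessions and database name hypothetical); the returned pandas DataFrame holds one row per closest-homolog pair between the two accessions:

# df = get_pairwise_connexions("NC_000117", "NC_015713", "my_biodb")
# df.columns -> ['start1', 'end1', 'start2', 'end2', 'identity']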
Example #18
def load_effectiveT3_table(table_file, biodb, hash2locus_list):
    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'create database if not exists effectors;'
    server.adaptor.execute(sql, )

    sql = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb
    locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'select locus_tag, taxon_id from orthology_detail_%s' % biodb
    locus_tag2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'CREATE table IF NOT EXISTS effectors.predicted_effectiveT3_%s (seqfeature_id INT primary key,taxon_id INT,score FLOAT,effectors INT,' \
            ' INDEX taxon_id (taxon_id))' % biodb

    server.adaptor.execute(sql, )

    with open(table_file, 'r') as f:
        for i, row in enumerate(f):
            if row[0] == '#':
                continue
            data = row.rstrip().split(';')
            if data[0] == 'Id':
                continue
            hash = data[0]
            if data[3] == 'false':
                effector = 0
                continue
            elif data[3] == 'true':
                effector = 1
            else:
                print('unknown result', data)
                continue

            for locus in hash2locus_list[hash]:
                # CRC-02125D79C6DFBC1D; RhT_01099 Rhabdochlamydia helvetica T3358;1.000000000000000;true
                sql = 'insert into effectors.predicted_effectiveT3_%s values (%s,%s,%s,%s)' % (
                    biodb, locus_tag2seqfeature_id[locus],
                    locus_tag2taxon_id[locus], data[2], effector)
                server.adaptor.execute(sql, )
            server.commit()
Example #19
def biodb2cds_gc(biodb):
    from chlamdb.biosqldb import manipulate_biosqldb
    from Bio.SeqUtils import GC123

    server, db = manipulate_biosqldb.load_db(biodb)

    sql1 = 'select distinct accession from orthology_detail_%s' % biodb
    sql2 = 'select locus_tag, taxon_id from orthology_detail_%s' % biodb
    sql3 = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb

    accession_list = [i[0] for i in server.adaptor.execute_and_fetchall(sql1,)]

    locus2taxon_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql2,))
    locus2seqfeature_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql3,))

    sql_head = 'create table IF NOT EXISTS custom_tables.gc_content_%s (taxon_id INT, ' \
          ' seqfeature_id INT,' \
          ' seq_length INT, gc_percent FLOAT, gc_1 FLOAT, gc_2 FLOAT, gc_3 FLOAT, ' \
               ' INDEX seqfeature_id(seqfeature_id), index taxon_id(taxon_id))' % biodb

    server.adaptor.execute(sql_head,)

    count_all=0
    for accession in accession_list:
        print (accession)
        record = db.lookup(accession=accession)
        seq = record.seq
        for n, feature in enumerate(record.features):
            if feature.type == 'CDS' and 'pseudo' not in feature.qualifiers and 'pseudogene' not in feature.qualifiers and 'translation' in feature.qualifiers:
                count_all+=1
                dna_sequence = feature.extract(seq)
                locus = feature.qualifiers['locus_tag'][0]

                gc, gc1, gc2, gc3 = GC123(str(dna_sequence))
                sql = 'insert into  custom_tables.gc_content_%s values (%s, %s, %s, %s, %s, %s, %s);' % (biodb,
                                                                                                          locus2taxon_id[locus],
                                                                                                          locus2seqfeature_id[locus],
                                                                                                          len(dna_sequence),
                                                                                                          round(gc,2),
                                                                                                          round(gc1),
                                                                                                          round(gc2),
                                                                                                          round(gc3))
                server.adaptor.execute(sql,)
        server.commit()
Example #20
def import_silix(silix_output, biodb, silix_cutoff):
    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)
    import re
    # columns
    # 0 taxon_id
    # 1 seqfeature_id
    # 2 silix_family_id

    sql = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb
    locus2seqfeature_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))
    sql = 'select locus_tag, taxon_id from biosqldb.orthology_detail_%s' % biodb
    locus2taxon_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))
    sql = 'create table IF NOT EXISTS comparative_tables.silix_%s_%s (taxon_id INT, ' \
          ' seqfeature_id INT, ' \
          ' silix_id INT,' \
          ' INDEX seqfeature_id (seqfeature_id), index taxon_id(taxon_id), index silix_id(silix_id));' % (biodb, silix_cutoff)
    server.adaptor.execute(sql,)
    server.commit()
    with open(silix_output, 'r') as f:
        row_list = [i for i in f]
        family_list = list(set([i.rstrip().split('\t')[0] for i in row_list]))

        for n, row in enumerate(row_list):
            if n == 0:
                continue
            data = row.rstrip().split('\t')

            family_index = family_list.index(data[0])
            locus_tag = data[1]
            taxon_id = locus2taxon_id[locus_tag]
            seqfeature_id = locus2seqfeature_id[locus_tag]
            print(family_index, locus_tag)

            sql = 'insert into comparative_tables.silix_%s_%s values (%s,  %s, %s);' % (biodb,
                                                                 silix_cutoff,
                                                                 taxon_id,
                                                                 seqfeature_id,
                                                                 family_index)

            server.adaptor.execute(sql,)

        server.commit()
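The parser above expects the usual two-column, tab-separated SiLiX output (family identifier, then sequence/locus name), for example (locus names hypothetical):

# FAM000001    RhT_01099
# FAM000001    CT_442
# FAM000002    RhT_01100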
Example #21
def load_T3_MM_table(table_file, biodb, hash2locus_list):
    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'create database if not exists effectors;'
    server.adaptor.execute(sql, )

    sql = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb
    locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'select locus_tag, taxon_id from orthology_detail_%s' % biodb
    locus_tag2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'CREATE table effectors.predicted_T3MM_%s (seqfeature_id INT primary key,taxon_id INT,value FLOAT,effectors INT,' \
            ' probability FLOAT, INDEX taxon_id (taxon_id))' % biodb

    server.adaptor.execute(sql, )

    with open(table_file, 'r') as f:
        for row in f:
            data = row.rstrip().split(',')
            if data[0] == 'seqName':
                continue
            else:
                hash = data[0]
                if data[2] == 'NO':
                    effector = 0
                    continue
                elif data[2] == 'YES':
                    effector = 1
                else:
                    print('unknown result', data)
                    continue
                for locus in hash2locus_list[hash]:

                    # seqName,value,T3SE,probability
                    sql = 'insert into effectors.predicted_T3MM_%s values (%s,%s,%s,%s, %s)' % (
                        biodb, locus_tag2seqfeature_id[locus],
                        locus_tag2taxon_id[locus], data[1], effector, data[3])
                    server.adaptor.execute(sql, )
                server.commit()
Example #22
def load_chaperones_table(table_file, biodb='chlamydia_04_16'):
    from chlamdb.biosqldb import manipulate_biosqldb
    import re
    '''
    WARNING: several target sequences may be identified within a single protein,
    which produces duplicated seqfeature_id rows in this table.
    '''

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb
    locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'select locus_tag, taxon_id from orthology_detail_%s' % biodb
    locus_tag2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'CREATE table IF NOT EXISTS effectors.predicted_chaperones_%s (seqfeature_id INT,taxon_id INT,motif varchar(400), info TEXT,' \
          ' INDEX taxon_id (taxon_id), index seqfeature_id (seqfeature_id))' % biodb

    server.adaptor.execute(sql, )

    with open(table_file, 'r') as f:
        for i, row in enumerate(f):
            data = row.rstrip().split('\t')
            if data[1] == 'Protein does not contain a CCBD within the first 150 amino acids':
                continue
            else:
                motif = data[1].split('(')[1].split(')')[0]
                info = data[1].split(') ')[1]

            #Protein,SVM-Value,T4S protein or not
            #locus = re.sub('"', "", data[0])
            sql = 'insert into effectors.predicted_chaperones_%s values (%s,%s,"%s", "%s")' % (
                biodb, locus_tag2seqfeature_id[data[0]],
                locus_tag2taxon_id[data[0]], motif, info)
            try:
                server.adaptor.execute(sql, )
                server.commit()
            except Exception:
                print(data)
                import sys
                sys.exit()
Example #23
def update_analysis_dico(server):

    from chlamdb.biosqldb import manipulate_biosqldb

    sql = 'select analysis_name, analysis_id from interpro.analysis'

    analysis_name2analysis_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    return analysis_name2analysis_id
Example #24
def load_DeepT3_table(table_file, biodb, hash2locus_list):
    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'create database if not exists effectors;'
    server.adaptor.execute(sql, )

    sql = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb
    locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'select locus_tag, taxon_id from orthology_detail_%s' % biodb
    locus_tag2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'CREATE table IF NOT EXISTS effectors.predicted_DeepT3_%s (seqfeature_id INT primary key,taxon_id INT,effectors INT,' \
            ' INDEX taxon_id (taxon_id))' % biodb

    server.adaptor.execute(sql, )

    with open(table_file, 'r') as f:
        for i, row in enumerate(f):
            data = row.rstrip().split('\t')
            if i < 3:
                continue
            else:
                hash = data[0]
                if data[1] == 'non-T3SE':
                    effector = 0
                    continue
                elif data[1] == 'T3SE':
                    effector = 1
                else:
                    print('unknown result', data)
                    continue
                for locus in hash2locus_list[hash]:
                    # CRC-20ACA5744563D026	non-T3SE
                    sql = 'insert into effectors.predicted_DeepT3_%s values (%s,%s,%s)' % (
                        biodb, locus_tag2seqfeature_id[locus],
                        locus_tag2taxon_id[locus], effector)
                    server.adaptor.execute(sql, )
                server.commit()
Example #25
    def __init__(self, string_sqlite, blast_results, db_name, hash2locus_list,
                 query_fasta_file, db_fasta_file):

        self.string_conn = sqlite3.connect(string_sqlite)
        self.string_cursor = self.string_conn.cursor()

        self.server, self.db = manipulate_biosqldb.load_db(db_name)
        self.blast_results = blast_results
        self.hash2locus_list = hash2locus_list
        self.db_name = db_name
        self.query2len = self.parse_fasta(query_fasta_file)
        self.hit2len = self.parse_fasta(db_fasta_file)

        self.missing_pmid_data = []

        # create tables if not exists
        self.create_tables()

        # retrieve data from string db
        sql = 'select pmid,publication_date,publication_source,linkout_url,authors,title from publications;'

        sql_species = 'select species_id,compact_name from species'
        self.pmid2article_data = manipulate_biosqldb.to_dict(
            self.string_cursor.execute(sql, ).fetchall())
        self.species_id2species_name = manipulate_biosqldb.to_dict(
            self.string_cursor.execute(sql_species).fetchall())

        # retrieve protein hash from biosqldb
        sql = f'select distinct hash from string.seqfeature_id2string_protein_mapping t1 inner join annotation.hash2seqfeature_id_{db_name} t2 on t1.seqfeature_id=t2.seqfeature_id;'
        self.hash_in_db = set(
            [i[0] for i in self.server.adaptor.execute_and_fetchall(sql, )])

        # check if some data were already inserted
        sql = 'select pmid from string.pmid2data_stringdb'
        self.pmid_in_db = [
            i[0] for i in self.server.adaptor.execute_and_fetchall(sql, )
        ]

        sql = 'select accession,id from string.string_protein_entry'
        self.string_proteins2string_protein_id = manipulate_biosqldb.to_dict(
            self.server.adaptor.execute_and_fetchall(sql, ))
Example #26
def load_T4SEpre_psAac_table(table_file, biodb='chlamydia_04_16'):
    from chlamdb.biosqldb import manipulate_biosqldb
    import re

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb
    locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'select locus_tag, taxon_id from orthology_detail_%s' % biodb
    locus_tag2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'CREATE table IF NOT EXISTS effectors.predicted_T4SEpre_psAac_%s (seqfeature_id INT primary key,taxon_id INT,SVM_value FLOAT,effectors INT,' \
          ' INDEX taxon_id (taxon_id))' % biodb

    server.adaptor.execute(sql, )

    with open(table_file, 'r') as f:
        for i, row in enumerate(f):
            data = row.rstrip().split(',')
            if i < 2:
                continue
            else:

                if data[2] == 'NO':
                    effector = 0
                    continue
                elif data[2] == 'YES':
                    effector = 1
                else:
                    print('unknown result', data)
                    continue

                #Protein,SVM-Value,T4S protein or not
                #locus = re.sub('"', "", data[0])
                sql = 'insert into effectors.predicted_T4SEpre_psAac_%s values (%s,%s,%s,%s)' % (
                    biodb, locus_tag2seqfeature_id[data[0]],
                    locus_tag2taxon_id[data[0]], data[1], effector)
                server.adaptor.execute(sql, )
                server.commit()
Example #27
def blastswiss2biosql( locus_tag2seqfeature_id,
                    db_name,
                    n_procs,
                    mysql_host,
                    mysql_user,
                    mysql_pwd,
                    mysql_db,
                    hash2locus_list,
                    *input_blast_files):

    import sys
    from chlamdb.biosqldb import manipulate_biosqldb

    create_sql_blast_swissprot_tables(db_name,
                                      mysql_host,
                                      mysql_user,
                                      mysql_pwd, mysql_db)

    # 'server' is used below but was never created in the original snippet
    server, db = manipulate_biosqldb.load_db(db_name)


    print ('get locus2taxon_id')
    sql = 'select locus_tag, taxon_id from orthology_detail_%s' % db_name
    locus_tag2taxon_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))
    print ('get locus2bioentry')
    sql2 = 'select locus_tag,bioentry_id from biodatabase t1 ' \
           ' inner join bioentry as t2 on t1.biodatabase_id=t2.biodatabase_id' \
           ' inner join orthology_detail_%s t3 on t2.accession=t3.accession where t1.name="%s"' % (db_name,db_name)

    locus_tag2bioentry_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql2,))

    load_blastswissprot_file_into_db(locus_tag2taxon_id,
                            locus_tag2seqfeature_id,
                            locus_tag2bioentry_id,
                            mysql_host,
                            mysql_user,
                            mysql_pwd,
                            mysql_db,
                            input_blast_files,
                            db_name,
                            hash2locus_list)

    sys.stdout.write("done!")
Example #28
def parse_orthofinder_blast_files(biodb,
                                  orthofinder_blast_file_list,
                                  locus_tag2orthofinder_id,
                                  evalue_cutoff=0.01,
                                  identity_cutoff=0,
                                  sqlite3=False):

    from chlamdb.biosqldb import manipulate_biosqldb
    if not sqlite3:
        server, db = manipulate_biosqldb.load_db(biodb)
    else:
        server, db = manipulate_biosqldb.load_db(biodb, sqlite3)

    sql = 'select locus_tag, taxon_id from annotation.seqfeature_id2locus_%s' % biodb

    locus_tag2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    locus2taxon2best_hit_id = {}
    for n, file in enumerate(orthofinder_blast_file_list):
        print(n, len(orthofinder_blast_file_list))
        with open(file, 'r') as f:
            for line in f:
                #9_1	60_294	56.647	346	139	3	4	338	5	350	3.33e-141	405
                data = line.split('\t')
                seq_1_id = data[0]
                seq_2_id = data[1]
                locus_tag_1 = locus_tag2orthofinder_id[seq_1_id]
                locus_tag_2 = locus_tag2orthofinder_id[seq_2_id]
                identity = data[2]
                evalue = data[10]
                bitscore = data[11]
                query_align_length = int(data[7]) - int(data[6])
                hit_align_length = int(data[9]) - int(data[8])
                taxon_id_seq_2 = locus_tag2taxon_id[locus_tag_2]
                if locus_tag_1 not in locus2taxon2best_hit_id:
                    locus2taxon2best_hit_id[locus_tag_1] = {}
                    locus2taxon2best_hit_id[locus_tag_1][taxon_id_seq_2] = [
                        locus_tag_2, identity, evalue, bitscore,
                        query_align_length, hit_align_length
                    ]
                else:
                    # not the best hit, skip
                    if taxon_id_seq_2 in locus2taxon2best_hit_id[locus_tag_1]:
                        continue
                    # best hit
                    else:
                        locus2taxon2best_hit_id[locus_tag_1][
                            taxon_id_seq_2] = [
                                locus_tag_2, identity, evalue, bitscore,
                                query_align_length, hit_align_length
                            ]
    return locus2taxon2best_hit_id
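A sketch of the nested dictionary returned above (values hypothetical): for each query locus, the first hit per subject taxon as [hit locus, identity, evalue, bitscore, query alignment length, hit alignment length]:

# locus2taxon2best_hit_id = {
#     "RhT_01099": {
#         "67": ["CT_442", "56.647", "3.33e-141", "405", 334, 345],
#     },
# }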
Example #29
def identity_closest_homolog(db_name):

    from chlamdb.biosqldb import manipulate_biosqldb
    from chlamdb.biosqldb import biosql_own_sql_tables
    import sys

    server, db = manipulate_biosqldb.load_db(db_name)

    sql1 = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % db_name
    locus2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql1, ))

    sql2 = "CREATE TABLE comparative_tables.identity_closest_homolog2_%s(taxon_1 INT NOT NULL," \
          " taxon_2 INT NOT NULL," \
          " locus_1 INT NOT NULL," \
          " locus_2 INT NOT NULL," \
          " identity FLOAT, index locus_1(locus_1)," \
          " index locus_2(locus_2), index taxon_1(taxon_1), index taxon_2(taxon_2))" % (db_name)

    server.adaptor.execute(sql2)

    #identitydico = biosql_own_sql_tables.calculate_average_protein_identity_new_tables(db_name)
    taxon2description = manipulate_biosqldb.taxon_id2genome_description(
        server, biodatabase_name=db_name)

    all_taxons = taxon2description.keys()
    for i, taxon_1 in enumerate(all_taxons):

        locus2identity = biosql_own_sql_tables.circos_locus2taxon_highest_identity(
            db_name, taxon_1)

        for taxon_2 in all_taxons:
            if taxon_1 == taxon_2:
                continue

            for locus in locus2identity:
                try:
                    #print taxon_1, taxon_2, locus, locus2identity[locus][long(taxon_2)][1], locus2identity[locus][long(taxon_2)][0]
                    #sys.stdout.write("%s\t%s\n" % (taxon_1, taxon_2))
                    sql = 'insert into comparative_tables.identity_closest_homolog2_%s(taxon_1, taxon_2, locus_1, locus_2, identity) ' \
                          ' VALUES ("%s", "%s", "%s", "%s", %s)' % (db_name,
                                                                    taxon_1,
                                                                    taxon_2,
                                                                    locus2seqfeature_id[locus],
                                                                    locus2seqfeature_id[locus2identity[locus][int(taxon_2)][1]],
                                                                    locus2identity[locus][int(taxon_2)][0])
                    server.adaptor.execute(sql)

                except KeyError:
                    # no homologs
                    continue
        server.adaptor.commit()
Example #30
def get_pathway_ko_association_table():
    import os
    import MySQLdb
    from chlamdb.biosqldb import manipulate_biosqldb
    import urllib.request
    from Bio.KEGG.KGML import KGML_parser
    import re

    sqlpsw = os.environ['SQLPSW']

    conn = MySQLdb.connect(
        host="localhost",  # your host, usually localhost
        user="******",  # your username
        passwd=sqlpsw,  # your password
        db="enzyme")  # name of the data base
    cursor = conn.cursor()

    sql = 'create table enzyme.pathway2ortholog_associations (pathway_id INT, node_id INT, ko_id varchar(200), ' \
          ' index pathway_id(pathway_id), index node_id(node_id), index ko_id(ko_id));'
    cursor.execute(sql, )
    conn.commit()

    sql2 = 'select pathway_name,pathway_id from enzyme.kegg_pathway'
    cursor.execute(sql2, )

    pathway2pathway_id = manipulate_biosqldb.to_dict(cursor.fetchall())

    for pathway in pathway2pathway_id:
        print(pathway)

        url_template = 'http://rest.kegg.jp/get/%s/kgml' % re.sub(
            'map', 'ko', pathway)
        print(url_template)
        try:
            f = urllib.request.urlopen(url_template)
        except:
            continue
        from Bio.Graphics import KGML_vis

        pathway_KGML = KGML_parser.read(f.read().decode("UTF-8"))

        # Loop over the orthologs in the pathway, and change the
        # background colour
        orthologs = [e for e in pathway_KGML.orthologs]
        for o in orthologs:
            ko_temp_list = list(set([i.rstrip() for i in o.name.split('ko:')]))
            ko_temp_list = filter(None, ko_temp_list)
            for ko in ko_temp_list:
                sql = 'insert into enzyme.pathway2ortholog_associations values(%s, %s, "%s")' % (
                    pathway2pathway_id[pathway], o.id, ko)
                cursor.execute(sql, )
        conn.commit()
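For reference, KGML ortholog entries carry space-separated names of the form shown below; the split on 'ko:' above turns them into individual KO accessions:

# o.name -> "ko:K00845 ko:K12407"
# after the split/rstrip/filter above, the loop sees "K00845" and "K12407"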