Ejemplo n.º 1
0
def parse_orthofinder_blast_files(biodb,
                                  orthofinder_blast_file_list,
                                  locus_tag2orthofinder_id,
                                  evalue_cutoff=0.01,
                                  identity_cutoff=0,
                                  sqlite3=False):

    from chlamdb.biosqldb import manipulate_biosqldb
    if not sqlite3:
        server, db = manipulate_biosqldb.load_db(biodb)
    else:
        server, db = manipulate_biosqldb.load_db(biodb, sqlite3)

    sql = 'select locus_tag, taxon_id from annotation.seqfeature_id2locus_%s' % biodb

    locus_tag2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    locus2taxon2best_hit_id = {}
    for n, file in enumerate(orthofinder_blast_file_list):
        print(n, len(orthofinder_blast_file_list))
        with open(file, 'r') as f:
            for line in f:
                #9_1	60_294	56.647	346	139	3	4	338	5	350	3.33e-141	405
                data = line.split('\t')
                seq_1_id = data[0]
                seq_2_id = data[1]
                locus_tag_1 = locus_tag2orthofinder_id[seq_1_id]
                locus_tag_2 = locus_tag2orthofinder_id[seq_2_id]
                identity = data[2]
                evalue = data[10]
                bitscore = data[11]
                query_align_length = int(data[7]) - int(data[6])
                hit_align_length = int(data[9]) - int(data[8])
                taxon_id_seq_2 = locus_tag2taxon_id[locus_tag_2]
                if locus_tag_1 not in locus2taxon2best_hit_id:
                    locus2taxon2best_hit_id[locus_tag_1] = {}
                    locus2taxon2best_hit_id[locus_tag_1][taxon_id_seq_2] = [
                        locus_tag_2, identity, evalue, bitscore,
                        query_align_length, hit_align_length
                    ]
                else:
                    # not the best hit, skip
                    if taxon_id_seq_2 in locus2taxon2best_hit_id[locus_tag_1]:
                        continue
                    # best hit
                    else:
                        locus2taxon2best_hit_id[locus_tag_1][
                            taxon_id_seq_2] = [
                                locus_tag_2, identity, evalue, bitscore,
                                query_align_length, hit_align_length
                            ]
    return locus2taxon2best_hit_id
Ejemplo n.º 2
0
def create_comparative_tables_accession(db_name, table_name):
    '''
    create a presence/absence matrix based on accession (and not taxon_ids)
    '''

    # create id column + one_column per taxon_id
    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(db_name)

    accession_list = get_all_accessions(db_name)

    sql = "CREATE TABLE comparative_tables.%s_accessions_%s(id VARCHAR(100) NOT NULL" % (
        table_name, db_name)

    for i in accession_list:
        sql += " ,%s INT" % i

    #for i in accession_list:
    #    sql+=" ,index %s(%s)" % (i, i)
    sql += ")"
    server.adaptor.execute(sql)

    sql_index = 'CREATE index %s on comparative_tables.%s_accessions_%s(id)' % (
        randomString(5), table_name, db_name)
    server.adaptor.execute(sql_index)
Ejemplo n.º 3
0
def load_hmm_data(biodb, database_name, table_name, hmm_tab_files):

    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'create table %s.%s (locus_tag varchar(400), taxon_id INT, hmm_accession varchar(400));' % (database_name,
                                                                                                      table_name)

    server.adaptor.execute_and_fetchall(sql,)

    sql = 'select locus_tag, taxon_id from biosqldb.orthology_detail_%s' % biodb

    locus2taxon_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))

    for hmm_file in hmm_tab_files:
        print hmm_file
        with open(hmm_file, 'r') as f:
            for row in f:
                if row[0] == '#':
                    continue
                else:
                    data = row.rstrip().split()
                    locus_tag = data[0]
                    hit_accession = data[3]
                    print '%s\t%s\t%s' % (hit_accession, locus_tag, locus2taxon_id[locus_tag])
                    sql = 'insert into %s.%s values ("%s", %s, "%s")' % (database_name, table_name,locus_tag, locus2taxon_id[locus_tag], hit_accession)
                    server.adaptor.execute(sql,)
        server.adaptor.commit()
Ejemplo n.º 4
0
def blast_sets_form(database_name):
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(database_name)

    sql = 'select distinct name from blast.blast_sets;'  #% database_name
    categories = server.adaptor.execute_and_fetchall(sql, )
    CHOICES = [(i[0], i[0]) for i in categories]
    CHOICES.append(("all", "all"))

    class BlastSetChoice(forms.Form):
        blast_set = forms.MultipleChoiceField(
            choices=CHOICES,
            widget=forms.SelectMultiple(attrs={'size': '20'}),
            required=False)

        score_cutoff = forms.CharField(max_length=3,
                                       label="Bitscore cutoff",
                                       initial=10,
                                       required=False)
        query_coverage_cutoff = forms.CharField(max_length=3,
                                                label="Query coverage cutoff",
                                                initial=0.5,
                                                required=False)
        hit_coverage_cutoff = forms.CharField(max_length=3,
                                              label="Query coverage cutoff",
                                              initial=0.5,
                                              required=False)

    return BlastSetChoice
Ejemplo n.º 5
0
def locus2inference_table(biodb):

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'CREATE TABLE locus_tag2uniprot_hit_%s (locus_tag varchar(400),' \
          ' uniprot_id varchar(400), index locus_tag(locus_tag))' % biodb

    server.adaptor.execute(sql, )

    locus2seqfeature_id = manipulate_biosqldb.locus_tag2seqfeature_id_dict(
        server, biodb)

    for locus in locus2seqfeature_id:
        sql = 'select value from seqfeature_qualifier_value where seqfeature_id=%s and value like "%%%%UniProtKB%%%%"' % (
            locus2seqfeature_id[locus])
        try:
            data = server.adaptor.execute_and_fetchall(sql, )[0][0]
            sql2 = 'insert into locus_tag2uniprot_hit_%s values ("%s", "%s")' % (
                biodb, locus, data.split(':')[2])
            try:
                server.adaptor.execute(sql2, )
                server.commit()
            except:
                print sql2
        except:
            pass
Ejemplo n.º 6
0
def locus2ec_table(hash2ec_dico, biodatabase, hash2locus_list):

    from chlamdb.biosqldb import manipulate_biosqldb
    from chlamdb.biosqldb import biosql_own_sql_tables

    server, db = manipulate_biosqldb.load_db(biodatabase)

    sql2 = 'CREATE TABLE IF NOT EXISTS enzyme.seqfeature_id2ec_%s (enzyme_id INT AUTO_INCREMENT PRIMARY KEY,' \
           ' seqfeature_id INT,' \
           ' ec_id INT,' \
           ' FOREIGN KEY(ec_id) REFERENCES enzymes(enzyme_id));' % (biodatabase)

    server.adaptor.execute_and_fetchall(sql2, )

    sql = 'select locus_tag, seqfeature_id from annotation.seqfeature_id2locus_%s' % biodatabase

    locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    for hash in hash2ec_dico:
        for ec_data in hash2ec_dico[hash]:
            for locus in hash2locus_list[hash]:
                sql = 'select enzyme_id from enzyme.enzymes where ec="%s"' % ec_data[
                    0]
                ec_id = server.adaptor.execute_and_fetchall(sql, )[0][0]
                seqfeature_id = locus_tag2seqfeature_id[locus]
                sql = 'insert into enzyme.seqfeature_id2ec_%s (seqfeature_id, ec_id) values (%s, %s)' % (
                    biodatabase, seqfeature_id, ec_id)
                server.adaptor.execute(sql, )
    server.commit()
Ejemplo n.º 7
0
def make_kegg_form(database_name):
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(database_name)

    sql_pathways = 'select description,description from enzyme.locus2ko_%s t1 ' \
                   ' inner join  enzyme.pathway2ko t2 ' \
                   ' on t1.ko_id = t2.ko_id ' \
                   ' inner join  enzyme.kegg_pathway t3 ' \
                   ' on t3.pathway_id=t2.pathway_id group by description;' % (database_name)

    pathway_choices = server.adaptor.execute_and_fetchall(sql_pathways, )

    sql_modules = 'select description,description from  enzyme.locus2ko_%s t1 ' \
                  ' inner join  enzyme.module2ko t2 on t1.ko_id = t2.ko_id ' \
                  ' inner join  enzyme.kegg_module t3 on t3.module_id=t2.module_id group by description;' % database_name

    module_choices = server.adaptor.execute_and_fetchall(sql_modules, )

    class KeggForm(forms.Form):

        pathway_choice = forms.MultipleChoiceField(
            label='',
            choices=pathway_choices,
            widget=forms.SelectMultiple(attrs={'size': '%s' % "17"}),
            required=False)
        module_choice = forms.MultipleChoiceField(
            choices=module_choices,
            widget=forms.SelectMultiple(attrs={'size': '%s' % "17"}),
            required=False,
            label="")

    return KeggForm
Ejemplo n.º 8
0
def find_links_recusrsive(biodb, all_connected_seqfeatures, ratio_cutoff=0.5, n_comp_cutoff=1):

    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)

    if biodb == 'chlamydia_04_16':
        filter = '"' + '","'.join(all_connected_seqfeatures) + '"'

        sql = 'select locus_1, locus_2 from interactions.colocalization_table_locus_%s where (locus_1 in (%s) or locus_2 in (%s)) and ' \
              ' (ratio >= %s and n_comparisons >= %s)' % (biodb, filter, filter, ratio_cutoff, n_comp_cutoff)
    else:
        all_connected_seqfeatures = [str(i) for i in all_connected_seqfeatures]
        filter = ','.join(all_connected_seqfeatures)

        sql = 'select locus_1, locus_2 from interactions.colocalization_table_locus_%s where (locus_1 in (%s) or locus_2 in (%s)) and ' \
              ' (ratio >= %s and n_comparisons >= %s)' % (biodb, filter, filter, ratio_cutoff, n_comp_cutoff)

        #print sql
    data = server.adaptor.execute_and_fetchall(sql,)

    all_groups = []
    for i in data:
        if i[0] not in all_groups:
            all_groups.append(i[0])
        if i[1] not in all_groups:
            all_groups.append(i[1])
    if len(all_groups) > len(all_connected_seqfeatures):
        return find_links_recusrsive(biodb, all_groups, ratio_cutoff, n_comp_cutoff)
    else:
        return all_groups
Ejemplo n.º 9
0
    def add_average_orthogroup_identity(self, biodatabase_name):
        #print 'adding average id to orthology table'
        server, db = manipulate_biosqldb.load_db(biodatabase_name)

        #print 'get orthogroups'
        sql = 'select orthogroup from comparative_tables.orthology_%s' % biodatabase_name
        try:
            #print 'adding column'
            self._create_orthogroup_average_identity_column(server, biodatabase_name)
        except:
            print ("column already created?")
        groups = [i[0] for i in server.adaptor.execute_and_fetchall(sql, )]
        #print len(groups)

        for group in groups:
            #print "group %s" % group
            try:
                id_table = np.array(get_orthogroup_identity_table(biodatabase_name, group))
                id_matrix = id_table[:,1:].astype(float)
                #print np.mean(id_matrix)
                #print self._get_average_identity_from_identity_matrix(id_matrix)
                av_id = round(np.mean(id_matrix[np.triu_indices(len(id_matrix), k=1)]), 2)
                #print av_id
            except:
                av_id = 0

            sql = 'insert into orth_%s.average_identity values ("%s", %s)' % (biodatabase_name, group, av_id)
            #print sql
            server.adaptor.execute(sql)
        server.commit()
Ejemplo n.º 10
0
def insert_new_tc_path(biodb, tc_name):
    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    separated_ids = tc_name.split('.')
    if len(separated_ids) != 5:
        print 'invalid id!'
        import sys
        sys.exit()
    else:
        id1 = separated_ids[0]
        id_superfamily = '.'.join(separated_ids[0:2])
        id_family = '.'.join(separated_ids[0:3])
        id_subfamily = '.'.join(separated_ids[0:4])
        id_complete = tc_name

        id_id1 = add_one_tc_id(biodb, id1)
        id_superfamily = add_one_tc_id(biodb, id_superfamily)
        id_family = add_one_tc_id(biodb, id_family)
        id_subfamily = add_one_tc_id(biodb, id_subfamily)
        id_complete = add_one_tc_id(biodb, id_complete)
    sql = 'select transporter_id from transporters.transporter_table where transporter_id=%s' % id_complete
    try:
        id = server.adaptor.execute_and_fetchall(sql, )[0][0]
        return id
    except:
        sql = 'insert into transporters.transporter_table (transporter_id, tc1,superfamily, family, subfamily) values ("%s",' \
              ' "%s","%s","%s","%s")' % (id_complete, id_id1, id_superfamily, id_family, id_subfamily)
        server.adaptor.execute(sql, )
        server.commit()
        return id_complete
Ejemplo n.º 11
0
def locus2ko_table(locus_tag2ko_dico, biodatabase, ko_accession2ko_id):

    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodatabase)

    sql2 = 'select locus_tag, seqfeature_id from annotation.seqfeature_id2locus_%s' % biodatabase

    locus2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql2))

    sql2 = 'CREATE TABLE IF NOT EXISTS enzyme.seqfeature_id2ko_%s (seqfeature_id INT,' \
           ' ko_id INT, ' \
           ' index ko_id (ko_id),' \
           ' index seqid (seqfeature_id));' % (biodatabase)

    server.adaptor.execute_and_fetchall(sql2, )

    for locus in locus_tag2ko_dico:
        ko = locus_tag2ko_dico[locus]
        ko_id = ko_accession2ko_id[ko]
        seqfeature_id = locus2seqfeature_id[locus]

        sql = 'insert into enzyme.seqfeature_id2ko_%s (seqfeature_id, ko_id) values (%s, %s)' % (
            biodatabase, seqfeature_id, ko_id)

        server.adaptor.execute(sql, )
    server.commit()
Ejemplo n.º 12
0
def import_phylo(phylo_list, biodb):
    from chlamdb.biosqldb import manipulate_biosqldb
    from chlamdb.biosqldb import biosql_own_sql_tables
    import ete3
    import re
    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'create table IF NOT EXISTS biosqldb_phylogenies.BBH_%s (orthogroup varchar(100), phylogeny text, INDEX orthogroup (orthogroup));' % biodb
    server.adaptor.execute(sql, )

    locuslag2orthogroup = biosql_own_sql_tables.locus_tag2orthogroup(biodb)
    l = len(phylo_list)
    for n, phylo in enumerate(phylo_list):
        print("%s/%s" % (n, l))
        t = ete3.Tree(phylo, format=0)
        leaves = [i for i in t.iter_leaves()]
        for leave in leaves:
            try:
                orthogroup = locuslag2orthogroup[leave.name]
                break
            except:
                continue
        sql = 'insert into biosqldb_phylogenies.BBH_%s values ("%s", "%s");' % (
            biodb, orthogroup, t.write())
        try:
            server.adaptor.execute(sql, )
        except:
            print(phylo)
    server.commit()
Ejemplo n.º 13
0
def get_mysql_table(db_name, table_name):
    import numpy as np

    server, db = manipulate_biosqldb.load_db(db_name)

    all_taxons_id = manipulate_biosqldb.get_taxon_id_list(server, db_name)

    sql_taxons = "id, "
    for i in range(0, len(all_taxons_id) - 1):
        sql_taxons += ' `%s`,' % all_taxons_id[i]
    sql_taxons += ' `%s`' % all_taxons_id[-1]

    sql = "select %s from comparative_tables.%s_%s" % (sql_taxons, table_name,
                                                       db_name)
    mat = np.array(server.adaptor.execute_and_fetchall(sql, ))
    f = open("%s_matrix.tab" % table_name, "w")

    taxonid2genome = manipulate_biosqldb.taxon_id2genome_description(
        server, db_name, True)

    taxons_ids = [taxonid2genome[int(i)] for i in all_taxons_id]

    f.write('"id"\t"' + '"\t"'.join(taxons_ids) + '"\n')

    for row in mat:
        row = [str(i) for i in row]
        f.write("\t".join(row) + "\n")
Ejemplo n.º 14
0
def pathway_list2profile_dico(biodb,
                              pathway_list,
                              taxon_id_list=[],
                              group_by_KO=True):
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select biodatabase_id from biodatabase where name="%s"' % biodb

    db_id = server.adaptor.execute_and_fetchall(sql, )[0][0]

    filter_2 = '"' + '","'.join(pathway_list) + '"'

    # RESTRICT TO AS SUBSET OF THE TAXON AVAILABLE
    if len(taxon_id_list) > 0:
        filter = ','.join(taxon_id_list)


        sql = 'select taxon_id, description, count(*) from (select distinct taxon_id,description,t3.pathway_id,t1.ko_id ' \
              ' from enzyme.locus2ko_%s t1 inner join enzyme.pathway2ko t2 on t1.ko_id=t2.ko_id ' \
              ' inner join enzyme.kegg_pathway as t3 on t2.pathway_id=t3.pathway_id ' \
              ' where t3.pathway_name in (%s) and taxon_id in (%s)) A group by taxon_id,pathway_id;' % (biodb,
                                                                                           filter_2,
                                                                                           filter)
        print(sql)
    else:
        if not group_by_KO:
            print('not grouping')
            sql = 'select taxon_id,description,count(*) from (select distinct taxon_id,description,t3.pathway_id,t1.ko_id ' \
                  ' from enzyme.locus2ko_%s t1 inner join enzyme.pathway2ko t2 on t1.ko_id=t2.ko_id ' \
                  ' inner join enzyme.kegg_pathway as t3 on t2.pathway_id=t3.pathway_id ' \
                  ' where t3.pathway_name in (%s)) A group by taxon_id,pathway_id;' % (biodb,
                                                                                               filter_2)
        else:
            print('grouping')
            sql = 'select taxon_id, description, count(*) from (select distinct taxon_id,description,t4.ko_accession ' \
                  ' from enzyme.seqfeature_id2ko_%s t1 ' \
                  ' inner join annotation.seqfeature_id2locus_%s tb on t1.seqfeature_id=tb.seqfeature_id ' \
                  ' inner join enzyme.pathway2ko t2 on t1.ko_id=t2.ko_id  inner join enzyme.kegg_pathway as t3 on t2.pathway_id=t3.pathway_id' \
                  ' inner join enzyme.ko_annotation t4 on t1.ko_id=t4.ko_id where pathway_name in (%s) ) A ' \
                  ' group by A.taxon_id, A.description;;' % (biodb,
                                                             biodb,
                                                             filter_2)
            print(sql)
    #print sql
    data = server.adaptor.execute_and_fetchall(sql, )

    code2taxon2count = {}
    pathway_list = []
    for row in data:
        row = list(row)
        #print row
        if row[1] not in pathway_list:
            pathway_list.append(row[1])
        if row[1] not in code2taxon2count:
            code2taxon2count[row[1]] = {}
            code2taxon2count[row[1]][str(row[0])] = int(row[2])
        else:
            code2taxon2count[row[1]][str(row[0])] = int(row[2])
    return pathway_list, code2taxon2count
Ejemplo n.º 15
0
def vcf2heatmap(vcf_file, biodb ="saureus_01_15", output="heat_ksnp_SaC.pdf"):
    my_array, sample_names = parse_vcf(vcf_file)
    server, db = manipulate_biosqldb.load_db(biodb)
    accession2description = manipulate_biosqldb.accession2description(server, biodb)

    for i in range(0, len(sample_names)):
        try:
            sample_names[i] = accession2description[sample_names[i]]
        except:
            pass

    M = my_array[:, 0:]
    #print M[0:10,:]
    # tri en fonction de la localisation
    #M = M[M[:, 0].argsort()]

    core = 0
    core_locations = []
    non_core = 0
    for index, value in enumerate(M[:,1:].sum(axis=1)):
        if value > 50:
            core+=1
            core_locations.append(int(M[index,0]))
        else:
            non_core+=1
    print "core", core, len(core_locations)
    print "non core", non_core
    print core/float((core+non_core))

    #M = heatmap.randomize_table(M[:,1:])

    #heatmap.heatmap_ksnp(M, format='png', output=output, breaks="-0.5, 0.5, 5, 99", rows=None, columns = sample_names,orderRows = False)
    return core_locations
Ejemplo n.º 16
0
def load_sport_data(psortdb_output_files,
                    db_name,
                    hash2locus_list):

    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(db_name)

    hash2psort_results = {}
    for psortdb_output in psortdb_output_files:
        hash2psort_results.update(parse_psort_results(psortdb_output))
    
    
    sql = f'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_{db_name}'
    locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))
    
    sql = f'create table if not exists custom_tables.seqfeature_id2psortb_{db_name} (seqfeature_id INT, final_prediction varchar(200), score FLOAT)'
    server.adaptor.execute(sql,)
    
    sql_template = f'insert into custom_tables.seqfeature_id2psortb_{db_name} values (%s, %s, %s)'
    
    for hash in hash2psort_results:
        for locus in hash2locus_list[hash]:
            final_pred = hash2psort_results[hash]['final_prediction']
            if final_pred == 'Unknown':
                score = None
            else:
                score = hash2psort_results[hash][final_pred]
            server.adaptor.execute(sql_template, (locus_tag2seqfeature_id[locus],
                                                  final_pred,
                                                  score))
    
    sql = f'create index sfp on custom_tables.seqfeature_id2psortb_{db_name}(seqfeature_id)'                 
    server.adaptor.execute(sql,)
    server.commit()
Ejemplo n.º 17
0
def del_blastnr_table_content(db_name):
    server, db = manipulate_biosqldb.load_db(db_name)
    sql = 'select accession from bioentry' \
      ' inner join biodatabase on bioentry.biodatabase_id=biodatabase.biodatabase_id where biodatabase.name="%s"' % db_name

    all_accessions = [i[0] for i in server.adaptor.execute_and_fetchall(sql, )]

    for accession in all_accessions:
        sql1 = 'DROP TABLE IF EXISTS blastnr.blastnr_hsps_%s_%s' % (db_name,
                                                                    accession)
        sql2 = 'DROP TABLE IF EXISTS blastnr.blastnr_hits_%s_%s' % (db_name,
                                                                    accession)
        sql3 = 'DROP TABLE IF EXISTS blastnr.blastnr_hits_taxonomy_%s_%s' % (
            db_name, accession)
        sql4 = 'DROP TABLE IF EXISTS blastnr.blastnr_hits_taxonomy_filtered_%s_%s' % (
            db_name, accession)

        print(sql1)
        server.adaptor.execute(sql1)
        server.adaptor.commit()
        print(sql3)
        server.adaptor.execute(sql3)
        server.adaptor.commit()
        print(sql4)
        server.adaptor.execute(sql4)
        server.adaptor.commit()
        print(sql2)
        server.adaptor.execute(sql2)
        server.adaptor.commit()
Ejemplo n.º 18
0
def taxon2module2count(biodb, category=False):

    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)

    if category:
        # C.pathway_category,taxon_id, A.pathway_name,A.n_enzymes, C.description
        # select distinct KO id
        # join with module to get counts/modules
        sql = 'select B.module_sub_cat,A.taxon_id,B.module_name,A.n,B.description from ' \
                            ' (select taxon_id, module_id, count(*) as n from ' \
                            ' (select distinct taxon_id,ko_id from enzyme.locus2ko_%s) t1 ' \
                            ' inner join enzyme.module2ko as t2 on t1.ko_id=t2.ko_id group by taxon_id, module_id) A ' \
                            ' inner join enzyme.kegg_module as B on A.module_id=B.module_id where module_sub_cat="%s";' % (biodb, category)
    else:
        sql = 'select B.module_sub_cat,A.taxon_id,B.module_name,A.n,B.description from ' \
                            ' (select taxon_id, module_id, count(*) as n from ' \
                            ' (select distinct taxon_id,ko_id from enzyme.locus2ko_%s) t1 ' \
                            ' inner join enzyme.module2ko as t2 on t1.ko_id=t2.ko_id group by taxon_id, module_id) A ' \
                            ' inner join enzyme.kegg_module as B on A.module_id=B.module_id;' % (biodb)

    pathway_data = server.adaptor.execute_and_fetchall(sql, )
    taxon2module2c = {}
    # 0 sub cat
    # 1 taxon_id
    # 2 module name
    # 3 count
    # 4 description
    for row in pathway_data:
        if row[1] not in taxon2module2c:
            taxon2module2c[row[1]] = {}
            taxon2module2c[row[1]][row[2]] = row[3]
        else:
            taxon2module2c[row[1]][row[2]] = row[3]
    return taxon2module2c
Ejemplo n.º 19
0
def locus2ko_table_legacy(biodatabase, hash2ko, hash2locus_list):
    # create legacy table locus2ko
    # TODO: remove all depenancies to this table
    # content:
    '''
    taxon_id, locus_tag, orthogroup, ko_id (not numerical id but KO*)
    '''

    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodatabase)

    sql = 'select locus_tag, taxon_id from orthology_detail_%s' % biodatabase
    sql2 = 'select locus_tag, orthogroup from orthology_detail_%s' % biodatabase

    locus2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql))
    locus2orthogroup = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql2))

    sql2 = 'CREATE TABLE IF NOT EXISTS enzyme.locus2ko_%s (taxon_id INT,'\
           ' locus_tag VARCHAR(200),' \
           ' orthogroup varchar(200),' \
           ' ko_id VARCHAR(200), index taxon_id (taxon_id), index ko_id (ko_id));' % (biodatabase)

    server.adaptor.execute_and_fetchall(sql2, )
    for hash in hash2ko:
        for locus in hash2locus_list[hash]:
            ko = hash2ko[hash]["KO"]

            sql = 'insert into enzyme.locus2ko_%s (taxon_id, locus_tag, orthogroup, ko_id) values ("%s", "%s", "%s", "%s")' % (
                biodatabase, locus2taxon_id[locus], locus,
                locus2orthogroup[locus], ko)

            server.adaptor.execute(sql, )
        server.commit()
Ejemplo n.º 20
0
def load_reference_phylogeny(db_name, newick_file, gbk_files):
    from chlamdb.biosqldb import manipulate_biosqldb
    from chlamdb.plots import parse_newick_tree

    file_name2taxid = gbk2taxid(gbk_files, db_name)
    print(file_name2taxid)

    server, db = manipulate_biosqldb.load_db(db_name)

    sql = 'select biodatabase_id from biodatabase where name="%s"' % (db_name)

    db_id = server.adaptor.execute_and_fetchall(sql, )[0][0]

    sql = 'create table if not exists reference_phylogeny (biodatabase_id INT, tree TEXT)'

    server.adaptor.execute(sql, )

    with open(newick_file, 'r') as f:
        rows = [i for i in f]
        if len(rows) > 1:
            raise IOError("wrong tree format (need newick)")
        newick_string = rows[0].rstrip()
        new_tree = parse_newick_tree.convert_terminal_node_names(
            newick_string, file_name2taxid, False)

    sql = 'insert into reference_phylogeny values( %s, "%s")' % (
        db_id, new_tree.write())
    server.adaptor.execute(sql, )
    server.commit()
Ejemplo n.º 21
0
def import_eggnog(eggnog_file, biodb):
    from chlamdb.biosqldb import manipulate_biosqldb
    import biosql_own_sql_tables
    import ete2
    import re
    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'create table IF NOT EXISTS COG.eggnog_hits_%s (query_seq varchar(100), ' \
          ' best_hit_eggNOG_ortholog varchar(400),' \
          ' best_hit_evalue FLOAT,' \
          ' best_hit_score FLOAT,' \
          ' predicted_name varchar(100),' \
          ' index query_seq(query_seq));' % biodb
    print sql
    server.adaptor.execute(sql,)
    with open(eggnog_file, 'r') as f:
        for n, one_hit in enumerate(f):
            if n == 0:
                continue
            else:
                data = one_hit.rstrip().split('\t')
                if data[1] == '-':
                    continue
                else:
                    sql = 'insert into  COG.eggnog_hits_%s values ("%s", "%s", %s, %s, "%s");' % (biodb, data[0], data[1], data[2], data[3], data[4])
                    print sql
                    server.adaptor.execute(sql,)
        server.commit()
Ejemplo n.º 22
0
def get_all_orthogroup_protein_fasta(server, biodatabase_name, out_dir):

    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodatabase_name)
    sql = 'select orthogroup from orthology_detail_%s group by orthogroup' % biodatabase_name

    all_groups = [i[0] for i in server.adaptor.execute_and_fetchall(sql, )
                  ]  # get_all_orthogroup_size(server, biodatabase_name).keys()
    #all_groups2 =  list(get_all_orthogroup_size(server, biodatabase_name).keys())
    #print all_groups2[0:5], all_groups[0:5]
    #print 'diff', list(set(all_groups2) - set(all_groups))

    #out_q = Queue()
    n_cpu = 8
    n_poc_per_list = int(numpy.ceil(len(all_groups) / float(n_cpu)))
    query_lists = _chunks(all_groups, n_poc_per_list)
    procs = []
    for one_list in query_lists:
        proc = Process(target=get_one_group_data,
                       args=(one_list, biodatabase_name, out_dir))  #, out_q))
        procs.append(proc)
        proc.start()

    print "join proc"
    time.sleep(5)
    for proc in procs:
        proc.join()
Ejemplo n.º 23
0
def import_phylo(phylo_list, biodb):
    from chlamdb.biosqldb import manipulate_biosqldb
    from chlamdb.biosqldb import biosql_own_sql_tables
    import ete3
    import re
    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'create database if not exists biosqldb_phylogenies'
    server.adaptor.execute(sql, )

    sql = 'create table IF NOT EXISTS biosqldb_phylogenies.%s (orthogroup varchar(100), phylogeny longtext);' % biodb
    server.adaptor.execute(sql, )

    locuslag2orthogroup = biosql_own_sql_tables.locus_tag2orthogroup(biodb)
    l = len(phylo_list)
    for n, phylo in enumerate(phylo_list):
        print("%s/%s" % (n, l))
        t = ete3.Tree(phylo)
        leaves = [i for i in t.iter_leaves()]
        orthogroup = locuslag2orthogroup[leaves[0].name]
        #print t.write()
        sql = 'insert into biosqldb_phylogenies.%s values ("%s", "%s");' % (
            biodb, orthogroup, t.write())
        #print sql
        server.adaptor.execute(sql, )
    server.commit()
Ejemplo n.º 24
0
def load_cog_names_table(table_file):
    '''
    create a table with distinct entries for cogs belonging to multiple categories (i.e NZ_CP007053  | COG2197 | TK)
    :param table_file:
    :return:
        '''
    import re
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db('chlamydia_04_16')
    sql = 'CREATE table COG.cog_names_2014 (COG_id varchar(100),' \
                                'functon varchar(10),' \
                                'name varchar(200), index COG_id(COG_id))'

    h = re.compile('^#.*')
    server.adaptor.execute_and_fetchall(sql, )

    with open(table_file, 'r') as f:
        for line in f:
            if re.match(h, line):
                continue
            else:
                data = line.rstrip().split('\t')
                cog = data[0]
                all_funct = list(data[1])
                name = re.sub('"', '', data[2])
                for one_category in all_funct:
                    sql = 'insert into COG.cog_names_2014 (COG_id, functon, name) values ("%s", "%s", "%s")' % (
                        cog, one_category, name)
                    print sql
                    server.adaptor.execute(sql, )
                    server.commit()
Ejemplo n.º 25
0
def module_list2profile_dico(biodb, module_list, taxon_id_list=[]):

    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select biodatabase_id from biodatabase where name="%s"' % biodb

    db_id = server.adaptor.execute_and_fetchall(sql, )[0][0]

    filter_2 = '"' + '","'.join(module_list) + '"'

    # RESTRICT TO AS SUBSET OF THE TAXON AVAILABLE
    if len(taxon_id_list) > 0:
        filter = ','.join(taxon_id_list)


        sql = 'select taxon_id, description, count(*) from (select distinct taxon_id,t1.ko_id,module_name,description ' \
              ' from enzyme.seqfeature_id2ko_%s t1' \
              ' inner join enzyme.module2ko t2 on t1.ko_id=t2.ko_id ' \
              ' inner join enzyme.kegg_module as t3 on t2.module_id=t3.module_id ' \
              ' inner join annotation.seqfeature_id2locus_%s t4 ' \
              ' on t1.seqfeature_id=t4.seqfeature_id' \
              ' where module_name in (%s) and taxon_id in (%s)) A group by A.taxon_id,A.module_name;' % (biodb,
                                                                                                         biodb,
                                                                                                         filter_2,
                                                                                                         filter)
    else:
        sql = 'select taxon_id, description, count(*) from (select distinct taxon_id,t1.ko_id,module_name,description ' \
              ' from enzyme.seqfeature_id2ko_%s t1' \
              ' inner join enzyme.module2ko t2 on t1.ko_id=t2.ko_id ' \
              ' inner join enzyme.kegg_module as t3 on t2.module_id=t3.module_id ' \
              ' inner join annotation.seqfeature_id2locus_%s t4 ' \
              ' on t1.seqfeature_id=t4.seqfeature_id' \
              ' where module_name in (%s)) A group by A.taxon_id,A.module_name;' % (biodb,
                                                                                                         biodb,
                                                                                                         filter_2)
    print(sql)
    data = server.adaptor.execute_and_fetchall(sql, )

    code2taxon2count = {}
    module_list = []
    for row in data:
        row = list(row)
        '''
        split_name = row[1].split('=>')
        if len(split_name[0]) >0:
            row[1] = split_name[0]
        '''
        #else:

        if row[1] not in module_list:
            module_list.append(row[1])
        if row[1] not in code2taxon2count:
            code2taxon2count[row[1]] = {}
            code2taxon2count[row[1]][str(row[0])] = int(row[2])
        else:
            code2taxon2count[row[1]][str(row[0])] = int(row[2])
    return module_list, code2taxon2count
Ejemplo n.º 26
0
def create_taxon_link_table(biodb):
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'create table interactions.colocalization_taxons_table_%s (group_1 varchar(200), group_2 varchar(200), taxon_1 INT,' \
          'taxon_2 INT)' % biodb

    server.adaptor.execute(sql, )
Ejemplo n.º 27
0
def create_group_link_table(biodb):
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'create table interactions.colocalization_table_%s (group_1 varchar(200), group_2 varchar(200), n_links INT,' \
          'n_comparisons INT, ratio float)' % biodb

    server.adaptor.execute(sql, )
Ejemplo n.º 28
0
def load_PMID(PMID_db_path, hash2locus_list, db_name):

    from chlamdb.biosqldb import manipulate_biosqldb
    from chlamdb.biosqldb import biosql_own_sql_tables
    import sqlite3

    sqlite_conn = sqlite3.connect(PMID_db_path)
    sqlite_cursor = sqlite_conn.cursor()

    server, db = manipulate_biosqldb.load_db(db_name)

    sql1 = 'create table string.seqfeature_id2pmid_%s (seqfeature_id INTEGER, pmid INTEGER);' % db_name
    sql2 = 'create table if not exists string.pmid2data (pmid INTEGER, title TEXT, authors TEXT, source TEXT, abstract TEXT);'

    server.adaptor.execute(sql1, )
    server.adaptor.execute(sql2, )

    # get locus_tag2seqfeaute_id dictionnary
    sql = 'select locus_tag, seqfeature_id from annotation.seqfeature_id2locus_%s' % db_name
    locus_tag2seqfeaure_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))
    # retrieve existing pmid data (if any)
    sql = 'select pmid from string.pmid2data '
    pmid_already_in_db = [
        str(i[0]) for i in server.adaptor.execute_and_fetchall(sql, )
    ]

    sql = 'select * from hash2pmid'
    hash_and_pmid = sqlite_cursor.execute(sql, ).fetchall()

    sql = 'select * from pmid2data'
    pmid2data = manipulate_biosqldb.to_dict(
        sqlite_cursor.execute(sql, ).fetchall())

    template1 = 'insert into string.pmid2data values (%s, %s, %s, %s, %s)'
    template2 = 'insert into string.seqfeature_id2pmid_%s' % db_name + ' values(%s, %s)'
    for n, row in enumerate(hash_and_pmid):
        hash, pmid = row
        locus_list = hash2locus_list[hash]
        if pmid not in pmid_already_in_db:
            try:
                server.adaptor.execute(template1,
                                       [pmid] + list(pmid2data[str(pmid)]))
                pmid_already_in_db.append(pmid)
            except KeyError:
                print("Problem with pmid %s" % pmid)
        for locus_tag in locus_list:
            server.adaptor.execute(template2,
                                   [locus_tag2seqfeaure_id[locus_tag], pmid])
    server.commit()

    sql1 = 'create index sf on string.seqfeature_id2pmid_%s(seqfeature_id)' % db_name
    sql2 = 'create index pm1 on string.seqfeature_id2pmid_%s(pmid)' % db_name
    sql3 = 'create index pm2 on string.pmid2data (pmid)'

    server.adaptor.execute(sql1, )
    server.adaptor.execute(sql2, )
    server.adaptor.execute(sql3, )
Ejemplo n.º 29
0
def get_all_accessions(db_name):
    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(db_name)

    sql = 'select accession from bioentry t1 inner join biodatabase t2 on t1.biodatabase_id=t2.biodatabase_id ' \
          ' where t2.name="%s"' % db_name
    accession_list = [i[0] for i in server.adaptor.execute_and_fetchall(sql, )]
    return accession_list
Ejemplo n.º 30
0
def orthogroup2nucleotide_seq_list(locus_tag2nucl_sequence, group, biodb_name):
    server = manipulate_biosqldb.load_db()
    locus_list = manipulate_biosqldb.orthogroup_id2locus_tag_list(
        server, group, biodb_name)
    seqs = []
    for locus in locus_list:
        seq = locus_tag2nucl_sequence[str(locus[2])]
        seqs.append(locus_tag2nucl_sequence[str(locus[2])])
    return seqs