예제 #1
0
def get_orthology_matrix_merging_plasmids_biosqldb(server, biodatabase_name):

    all_orthogroups = get_all_orthogroup_size(server, biodatabase_name)
    #server, db = manipulate_biosqldb.load_db(biodatabase_name)
    #sql='select orthogroup, count(*) from biosqldb.orthology_detail_%s group by orthogroup' % biodatabase_name

    # based on the new orthology detail table
    #all_orthogroups = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))

    n_groups = len(all_orthogroups.keys())
    print("ngroups", n_groups)

    all_taxons = manipulate_biosqldb.get_taxon_id_list(server,
                                                       biodatabase_name)
    ##print all_taxons
    #print 'get dico ortho count'
    detailed_orthology_count = {}
    for group in all_orthogroups.keys():
        detailed_orthology_count[group] = {}
        for taxon_id in all_taxons:
            detailed_orthology_count[group][int(taxon_id)] = 0

    for taxon_id in all_taxons:
        ##print taxon_id
        dico = manipulate_biosqldb.taxon_id2orthogroup_size(
            server, biodatabase_name, taxon_id)
        ##print dico
        #import time
        #time.sleep(3)
        ##print "taxon", taxon_id
        for group in dico.keys():
            detailed_orthology_count[group][int(taxon_id)] += dico[group]
    #print 'count ok'
    ##print detailed_orthology_count
    return detailed_orthology_count
예제 #2
0
def get_mysql_table(db_name, table_name):
    import numpy as np

    server, db = manipulate_biosqldb.load_db(db_name)

    all_taxons_id = manipulate_biosqldb.get_taxon_id_list(server, db_name)

    sql_taxons = "id, "
    for i in range(0, len(all_taxons_id) - 1):
        sql_taxons += ' `%s`,' % all_taxons_id[i]
    sql_taxons += ' `%s`' % all_taxons_id[-1]

    sql = "select %s from comparative_tables.%s_%s" % (sql_taxons, table_name,
                                                       db_name)
    mat = np.array(server.adaptor.execute_and_fetchall(sql, ))
    f = open("%s_matrix.tab" % table_name, "w")

    taxonid2genome = manipulate_biosqldb.taxon_id2genome_description(
        server, db_name, True)

    taxons_ids = [taxonid2genome[int(i)] for i in all_taxons_id]

    f.write('"id"\t"' + '"\t"'.join(taxons_ids) + '"\n')

    for row in mat:
        row = [str(i) for i in row]
        f.write("\t".join(row) + "\n")
예제 #3
0
def collect_COGs(db_name):

    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(db_name)

    taxon_id_list = manipulate_biosqldb.get_taxon_id_list(server, db_name)

    sql_head = 'INSERT INTO comparative_tables.COG_%s (id,' % db_name

    for taxon in taxon_id_list:
        sql_head += '`%s`,' % taxon
    sql_head = sql_head[0:-1] + ') values ('

    all_cogs_ids_sql = 'select COG_id from COG.locus_tag2gi_hit_%s group by COG_id;' % db_name

    all_cogs_ids = [
        i[0] for i in server.adaptor.execute_and_fetchall(all_cogs_ids_sql, )
    ]

    database_id = server.adaptor.execute_and_fetchall(
        'select biodatabase_id from biosqldb.biodatabase where name="%s"' %
        db_name)[0][0]

    i = 0
    for accession in all_cogs_ids:
        print(i, '/', len(all_cogs_ids), accession)
        i += 1
        sql = 'select B.taxon_id, A.count from (select accession,count(*) as count from ' \
             'COG.locus_tag2gi_hit_%s ' \
             'where COG_id = "%s" group by accession) A ' \
             'left join biosqldb.bioentry as B on A.accession=B.accession and biodatabase_id = %s;' % (db_name, accession, database_id)
        data = server.adaptor.execute_and_fetchall(sql, )

        taxon2count = {}
        for n in data:
            if n[0] not in taxon2count:
                taxon2count[n[0]] = int(n[1])
            else:
                taxon2count[n[0]] += int(n[1])

        sql_temp = sql_head + '"%s",' % accession

        for taxon in taxon_id_list:
            try:
                sql_temp += '%s,' % taxon2count[int(taxon)]
            except:
                sql_temp += '0,'
        sql_temp = sql_temp[0:-1] + ');'

        server.adaptor.execute(sql_temp, )
        server.adaptor.commit()
예제 #4
0
def collect_EC(db_name):

    from chlamdb.biosqldb import manipulate_biosqldb



    server, db = manipulate_biosqldb.load_db(db_name)

    sql_db_id = 'select biodatabase_id from biodatabase where name="%s"' % db_name

    biodb_id = server.adaptor.execute_and_fetchall(sql_db_id,)[0][0]

    print "biodb_id", biodb_id

    taxon_id_list = manipulate_biosqldb.get_taxon_id_list(server, db_name)

    sql_head = 'INSERT INTO comparative_tables.EC_%s (id,' % db_name

    for taxon in taxon_id_list:
        sql_head += '`%s`,' % taxon
    sql_head = sql_head[0:-1] + ') values ('

    all_EC_ids_sql = 'select distinct ec from enzyme.locus2ec_%s as t1 inner join enzyme.enzymes as t2 on t1.ec_id=t2.enzyme_id;' % (db_name)

    all_ec_ids = [i[0] for i in server.adaptor.execute_and_fetchall(all_EC_ids_sql,)]

    i = 0
    for accession in all_ec_ids:
        print i,'/', len(all_ec_ids), accession
        i+=1

        sql = 'select taxon_id, count(*) from (select taxon_id, t1.accession, ec_id from enzyme.locus2ec_%s as t1 ' \
                     ' inner join biosqldb.bioentry as t2 on t1.accession=t2.accession where biodatabase_id=%s) A ' \
                     ' inner join enzyme.enzymes as B on A.ec_id=B.enzyme_id where ec="%s" group by taxon_id;' % (db_name, biodb_id, accession)

        print sql
        #sql= 'select taxon_id, count(*) from enzyme.locus2ec_%s ' \
        #     ' where ec = "%s" group by taxon_id;' % (db_name, accession)

        data = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))
        sql_temp = sql_head + '"%s",' % accession

        for taxon in taxon_id_list:
            try:
                sql_temp += '%s,' % data[str(taxon)]
            except:
                sql_temp += '0,'
        sql_temp = sql_temp[0:-1] + ');'

        print sql_temp
        server.adaptor.execute(sql_temp,)
        server.adaptor.commit()
예제 #5
0
def biodb2randomized_matrix(bio_db_name):
    server, db = manipulate_biosqldb.load_db(bio_db_name)
    matrix = np.array(
        manipulate_biosqldb.get_orthology_table(server, bio_db_name))

    taxon_id2description = manipulate_biosqldb.taxon_id2genome_description(
        server, bio_db_name)
    #print taxon_id2description

    #group_names = matrix[:,0]
    taxons_ids = manipulate_biosqldb.get_taxon_id_list(server, bio_db_name)

    print 'Number of taxons:', len(taxons_ids)
    taxons_ids = [taxon_id2description[str(i)] for i in taxons_ids]

    import re
    for i, accession in enumerate(taxons_ids):
        #print i, accession
        description = taxons_ids[i]
        description = re.sub(", complete genome\.", "", description)
        description = re.sub(", complete genome", "", description)
        description = re.sub(", complete sequence\.", "", description)
        description = re.sub("strain ", "", description)
        description = re.sub("str\. ", "", description)
        description = re.sub(" complete genome sequence\.", "", description)
        description = re.sub(" complete genome\.", "", description)
        description = re.sub(" chromosome", "", description)
        description = re.sub(" DNA", "S.", description)
        description = re.sub("Merged record from ", "", description)
        description = re.sub(", wgs", "", description)
        description = re.sub("Candidatus ", "", description)
        description = re.sub(".contig.0_1, whole genome shotgun sequence.", "",
                             description)
        description = re.sub("Protochlamydia", "P.", description)
        description = re.sub("Chlamydia", "C.", description)
        description = re.sub("Chlamydophila", "E.", description)
        description = re.sub("Estrella", "E.", description)
        description = re.sub("Rhodopirellula", "R.", description)
        description = re.sub("Methylacidiphilum", "M.", description)
        description = re.sub(" phage", "", description)
        description = re.sub("Parachlamydia", "P.", description)
        description = re.sub("Neochlamydia", "Neo.", description)
        description = re.sub("Simkania", "S.", description)
        description = re.sub("Waddlia", "W.", description)
        description = re.sub("Pirellula", "P.", description)
        description = re.sub("Rhabdochlamydiaceae sp.", "Rhabdo", description)
        taxons_ids[i] = description

    M = matrix.astype(float)  # [:, 1:]
    M = heatmap.randomize_table(M)
    return (M, taxons_ids)
예제 #6
0
def create_comparative_tables(db_name, table_name):

    # create id column + one_column per taxon_id
    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(db_name)

    taxon_id_list = manipulate_biosqldb.get_taxon_id_list(server, db_name)

    sql = "CREATE TABLE comparative_tables.%s_%s(id VARCHAR(100) NOT NULL" % (table_name, db_name)
    for i in taxon_id_list:
        sql+=" ,`%s` INT" % i
    sql+=")"
    server.adaptor.execute(sql)
예제 #7
0
def get_conserved_core_groups(server, biodatabase_name):
    all_taxons_id = manipulate_biosqldb.get_taxon_id_list(
        server, biodatabase_name)

    sql_taxons = ""
    for i in range(0, len(all_taxons_id) - 1):
        sql_taxons += ' `%s` = 1 and' % all_taxons_id[i]
    sql_taxons += ' `%s` = 1' % all_taxons_id[-1]

    sql = "select orthogroup from orthology_%s where %s;" % (biodatabase_name,
                                                             sql_taxons)
    #print sql
    result = server.adaptor.execute_and_fetchall(sql, )

    return [i[0] for i in result]
예제 #8
0
def collect_EC(db_name):

    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(db_name)

    sql_db_id = 'select biodatabase_id from biodatabase where name="%s"' % db_name

    biodb_id = server.adaptor.execute_and_fetchall(sql_db_id, )[0][0]

    taxon_id_list = manipulate_biosqldb.get_taxon_id_list(server, db_name)

    sql_head = 'INSERT INTO comparative_tables.EC_%s (id,' % db_name

    for taxon in taxon_id_list:
        sql_head += '`%s`,' % taxon
    sql_head = sql_head[0:-1] + ') values ('

    all_EC_ids_sql = 'select distinct ec from enzyme.seqfeature_id2ec_%s t1 inner join  enzyme.enzymes t2 on t1.ec_id=t2.enzyme_id;' % (
        db_name)

    all_ec_ids = [
        i[0] for i in server.adaptor.execute_and_fetchall(all_EC_ids_sql, )
    ]

    i = 0
    for accession in all_ec_ids:
        print(i, '/', len(all_ec_ids), accession)
        i += 1

        sql = 'select taxon_id,count(*) as n from enzyme.seqfeature_id2ec_%s t1 inner join enzyme.enzymes t2 on t1.ec_id=t2.enzyme_id inner join annotation.seqfeature_id2locus_%s t3 on t1.seqfeature_id=t3.seqfeature_id where ec="%s" group by taxon_id,ec;' % (
            db_name, db_name, accession)

        data = manipulate_biosqldb.to_dict(
            server.adaptor.execute_and_fetchall(sql, ))
        sql_temp = sql_head + '"%s",' % accession

        for taxon in taxon_id_list:
            try:
                sql_temp += '%s,' % data[str(taxon)]
            except:
                sql_temp += '0,'
        sql_temp = sql_temp[0:-1] + ');'

        server.adaptor.execute(sql_temp, )
        server.adaptor.commit()
예제 #9
0
def collect_ko(db_name):

    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(db_name)

    sql_db_id = 'select biodatabase_id from biodatabase where name="%s"' % db_name

    biodb_id = server.adaptor.execute_and_fetchall(sql_db_id, )[0][0]

    taxon_id_list = manipulate_biosqldb.get_taxon_id_list(server, db_name)

    sql_head = 'INSERT INTO comparative_tables.ko_%s (id,' % db_name

    for taxon in taxon_id_list:
        sql_head += '`%s`,' % taxon
    sql_head = sql_head[0:-1] + ') values ('

    all_ko_ids_sql = 'select distinct ko_id from enzyme.locus2ko_%s;' % (
        db_name)

    all_ko_ids = [
        i[0] for i in server.adaptor.execute_and_fetchall(all_ko_ids_sql, )
    ]

    i = 0
    for accession in all_ko_ids:
        print(i, '/', len(all_ko_ids), accession)
        i += 1

        sql = 'select taxon_id, count(*) from enzyme.locus2ko_%s where ko_id="%s" group by taxon_id;' % (
            db_name, accession)

        data = manipulate_biosqldb.to_dict(
            server.adaptor.execute_and_fetchall(sql, ))
        sql_temp = sql_head + '"%s",' % accession

        for taxon in taxon_id_list:
            try:
                sql_temp += '%s,' % data[str(taxon)]
            except:
                sql_temp += '0,'
        sql_temp = sql_temp[0:-1] + ');'

        server.adaptor.execute(sql_temp, )
        server.adaptor.commit()
예제 #10
0
def collect_interpro(db_name):

    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(db_name)

    taxon_id_list = manipulate_biosqldb.get_taxon_id_list(server, db_name)

    sql_head = 'INSERT INTO comparative_tables.interpro_%s (id,' % db_name

    for taxon in taxon_id_list:
        sql_head += '`%s`,' % taxon
    sql_head = sql_head[0:-1] + ') values ('

    import sys
    #sys.exit()

    all_interpro_ids_sql = 'select interpro_accession from biosqldb.interpro_%s ' \
                       ' where interpro_accession != "0" group by interpro_accession;' % db_name

    all_interpro_ids = [
        i[0]
        for i in server.adaptor.execute_and_fetchall(all_interpro_ids_sql, )
    ]

    i = 0
    for accession in all_interpro_ids:
        print(i, '/', len(all_interpro_ids), accession)
        i += 1
        sql= 'select A.taxon_id, count(*) as n from (select taxon_id, locus_tag, interpro_accession ' \
             ' from biosqldb.interpro_%s where interpro_accession="%s" group by locus_tag) A group by taxon_id;' % (db_name, accession)

        data = manipulate_biosqldb.to_dict(
            server.adaptor.execute_and_fetchall(sql, ))

        sql_temp = sql_head + '"%s",' % accession

        for taxon in taxon_id_list:
            try:
                sql_temp += '%s,' % data[str(taxon)]
            except:
                sql_temp += '0,'
        sql_temp = sql_temp[0:-1] + ');'

        server.adaptor.execute(sql_temp, )
        server.adaptor.commit()
예제 #11
0
def write_ortho_matrix(bio_db_name):
    server, db = manipulate_biosqldb.load_db(bio_db_name)
    matrix = np.array(
        manipulate_biosqldb.get_orthology_table(server, bio_db_name))

    taxon_id2description = manipulate_biosqldb.taxon_id2genome_description(
        server, bio_db_name)

    group_names = matrix[:, 0]
    taxons_ids = manipulate_biosqldb.get_taxon_id_list(server, bio_db_name)

    import re
    for i in taxon_id2description.keys():
        taxon_id2description[i] = re.sub(" subsp\. aureus", "",
                                         taxon_id2description[i])
        taxon_id2description[i] = re.sub(", complete genome\.", "",
                                         taxon_id2description[i])
        taxon_id2description[i] = re.sub(", complete sequence\.", "",
                                         taxon_id2description[i])
        taxon_id2description[i] = re.sub("strain ", "",
                                         taxon_id2description[i])
        taxon_id2description[i] = re.sub("str\. ", "", taxon_id2description[i])
        taxon_id2description[i] = re.sub(" complete genome sequence\.", "",
                                         taxon_id2description[i])
        taxon_id2description[i] = re.sub(" complete genome\.", "",
                                         taxon_id2description[i])
        taxon_id2description[i] = re.sub(" chromosome", "",
                                         taxon_id2description[i])
        taxon_id2description[i] = re.sub(" DNA", "", taxon_id2description[i])

    taxons_ids = [taxon_id2description[str(i)] for i in taxons_ids]
    #print taxon_id2description
    f = open("ortho_matrix.tab", "w")

    f.write("orthogroup\t" + "\t".join(taxons_ids) + "\n")

    for row in matrix:
        f.write("\t".join(row) + "\n")
    f.close()
예제 #12
0
def collect_pfam(db_name):

    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(db_name)
    taxon_id_list = manipulate_biosqldb.get_taxon_id_list(server, db_name)
    sql_head = 'INSERT INTO comparative_tables.Pfam_%s (id,' % db_name

    for taxon in taxon_id_list:
        sql_head += '`%s`,' % taxon
    sql_head = sql_head[0:-1] + ') values ('

    all_pfam_ids_sql = 'select signature_accession from interpro_%s where analysis="Pfam" ' \
                       'group by signature_accession;' % db_name

    all_pfam_ids = [
        i[0] for i in server.adaptor.execute_and_fetchall(all_pfam_ids_sql, )
    ]

    i = 0
    for accession in all_pfam_ids:
        print(i, '/', len(all_pfam_ids), accession)
        i += 1
        sql = 'select taxon_id, count(*) from biosqldb.interpro_%s ' \
              ' where analysis="Pfam" and signature_accession="%s" group by taxon_id;' % (db_name, accession)

        data = manipulate_biosqldb.to_dict(
            server.adaptor.execute_and_fetchall(sql, ))

        sql_temp = sql_head + '"%s",' % accession

        for taxon in taxon_id_list:
            try:
                sql_temp += '%s,' % data[str(taxon)]
            except:
                sql_temp += '0,'
        sql_temp = sql_temp[0:-1] + ');'

        server.adaptor.execute(sql_temp, )
        server.adaptor.commit()
예제 #13
0
def plot_orthogroup_size_distrib(server,
                                 biodatabase_name,
                                 out_name="orthogroup_size_distrib.svg",
                                 taxon_id=False):

    from collections import Counter
    import numpy as np
    import pandas
    import itertools
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    import math
    import re

    if not taxon_id:

        all_taxons = manipulate_biosqldb.get_taxon_id_list(
            server, biodatabase_name)
        for key in all_taxons:
            sql = 'select description from bioentry where taxon_id=%s and description not like "%%%%plasmid%%%%" limit 1;'
            #print sql, key
            description = server.adaptor.execute_and_fetchall(sql, key)[0][0]

        #print all_taxons
        all_data = {}
        for taxon in all_taxons:
            sql = 'select orthogroup,`%s` from comparative_tables.orthology_%s where `%s` >0' % (
                taxon, biodatabase_name, taxon)
            result = manipulate_biosqldb._to_dict(
                server.adaptor.execute_and_fetchall(sql, ))
            all_data[taxon] = Counter(result.values())

        # get maximum group size and max number of proteins for a single group
        max_orthogroup_size = 0
        max_n_groups = 0
        for key in all_data.keys():
            dico = all_data[key]
            temp = int(max(dico.keys()))
            temp2 = int(max(dico.values()))
            if temp > max_orthogroup_size:
                max_orthogroup_size = temp
            if temp2 > max_n_groups:
                max_n_groups = temp2

        #for key in all_data.keys():
        #    dico= all_data[key]
        #    for i in range(1, max_orthogroup_size):
        #        if i not in dico.keys():
        #            dico[i] = 0

        # transform into pandas serie objects
        complete_data = {}
        for key in all_data.keys():
            dico = all_data[key]
            df = pandas.Series(dico)
            complete_data[key] = df

        # create figure with one subplot per genome
        # plot per genome group size distribution
        n = int(math.ceil(len(all_taxons) / 2.0))
        #print n
        if n % 2 == 0:
            n += 1
        fig, axes = plt.subplots(nrows=n, ncols=2)
        for key, location in zip(complete_data.keys(),
                                 list(itertools.product(range(n), repeat=2))):
            serie = complete_data[key]
            sql = 'select description from bioentry where taxon_id=%s and description not like "%%%%plasmid%%%%" limit 1;'
            #print sql, key
            description = server.adaptor.execute_and_fetchall(sql, key)[0][0]
            #print description

            description = re.sub(", complete genome\.", "", description)
            description = re.sub(", complete genome", "", description)
            description = re.sub(", complete sequence\.", "", description)
            description = re.sub("strain ", "", description)
            description = re.sub("str\. ", "", description)
            description = re.sub(" complete genome sequence\.", "",
                                 description)
            description = re.sub(" complete genome\.", "", description)
            description = re.sub(" chromosome", "", description)
            description = re.sub(" DNA", "S.", description)
            description = re.sub("Merged record from ", "", description)
            description = re.sub(", wgs", "", description)
            description = re.sub("Candidatus ", "", description)
            description = re.sub(".contig.0_1, whole genome shotgun sequence.",
                                 "", description)
            p = serie.plot(ax=axes[location[1], location[0]],
                           kind="bar",
                           alpha=1,
                           color='r')
            p.set_ylim(-100, max_n_groups + 0.1 * max_n_groups)
            p.set_title(description)

            for rect in p.patches:
                height = rect.get_height()
                p.text(rect.get_x() + rect.get_width() / 2.,
                       height + 0.02 * max_n_groups,
                       int(height),
                       ha='center',
                       va='bottom')

        # plot all genomes group size distribution
        #server, db = manipulate_biosqldb.load_db(db_name)
        sql = 'select orthogroup, count(*) from biosqldb.orthology_detail_%s group by orthogroup' % biodatabase_name

        # based on the new orthology detail table
        all_grp_size = manipulate_biosqldb.to_dict(
            server.adaptor.execute_and_fetchall(sql, ))
        # based on biosqldb tables
        #all_grp_size = get_all_orthogroup_size(server, biodatabase_name)

        df = pandas.Series(Counter(all_grp_size.values()))
        p = df.plot(kind="bar", alpha=0.5, color='b')
        p.set_ylim(-100, max(df) + 0.1 * max(df))
        p.set_title("Orthogroup size distribution")

        # add group size as text at the top of the bars
        for rect in p.patches:
            height = rect.get_height()
            p.text(rect.get_x() + rect.get_width() / 2.,
                   height + 0.02 * max_n_groups,
                   int(height),
                   ha='center',
                   va='bottom')
        fig.set_size_inches(18, 6 * n)
        plt.savefig(out_name, format="svg")

    if taxon_id:
        #print "ID!!!"
        sql = 'select orthogroup,`%s` from %s where `%s` >0'
        othogroup_id2size = manipulate_biosqldb._to_dict(
            server.adaptor.execute_and_fetchall(
                sql, (taxon_id, biodatabase_name, taxon_id)))

        serie = pandas.Series(Counter(othogroup_id2size.values()))
        p = serie.plot(kind="bar", alpha=0.5, color='b')
        p.set_ylim(-100, max(serie) + 0.1 * max(serie))
        sql = 'select description from bioentry where taxon_id=%s limit 1;'
        description = server.adaptor.execute_and_fetchall(sql, taxon_id)
        p.set_title(description[0][0])

        for rect in p.patches:
            height = rect.get_height()
            p.text(rect.get_x() + rect.get_width() / 2.,
                   height + 0.02 * max(serie),
                   int(height),
                   ha='center',
                   va='bottom')
        #plt.show()
        plt.savefig(out_name, format="svg")
예제 #14
0
        print 'getting single copy core groups'
        core_ortho = get_conserved_core_groups(server, args.db_name)
        import shutil
        for group in core_ortho:
            shutil.copy(
                os.path.join(
                    asset_path,
                    "%s_fasta_by_taxons/%s.txt" % (args.db_name, group)),
                os.path.join(asset_path,
                             "%s_fasta_core/%s.txt" % (args.db_name, group)))

        get_nucleotide_core_fasta(server, db, args.db_name, "./nucl")

    if args.core_groups_path:
        taxon_ids = manipulate_biosqldb.get_taxon_id_list(server, args.db_name)

        sql_include = ''
        if len(taxon_ids) > 0:
            for i in range(0, len(taxon_ids) - 1):
                sql_include += ' `%s` = 1 and ' % taxon_ids[i]
            sql_include += '`%s` = 1' % taxon_ids[-1]
        sql = 'select orthogroup from orthology_%s where %s' % (args.db_name,
                                                                sql_include)
        print sql
        group_list = [i[0] for i in server.adaptor.execute_and_fetchall(sql, )]
        locus_or_protein_id2taxon_id = manipulate_biosqldb.locus_or_protein_id2taxon_id(
            server, args.db_name)
        print 'Number of core groups: %s' % len(group_list)
        get_one_group_data_taxon(group_list, locus_or_protein_id2taxon_id,
                                 args.db_name, args.core_groups_path)