Ejemplo n.º 1
0
def vcf2heatmap(vcf_file, biodb="saureus_01_15", output="heat_ksnp_SaC.pdf"):
    my_array, sample_names = parse_vcf(vcf_file)
    server, db = manipulate_biosqldb.load_db(biodb)
    accession2description = manipulate_biosqldb.accession2description(
        server, biodb)

    for i in range(0, len(sample_names)):
        try:
            sample_names[i] = accession2description[sample_names[i]]
        except:
            pass

    M = my_array[:, 0:]
    #print M[0:10,:]
    # tri en fonction de la localisation
    #M = M[M[:, 0].argsort()]

    core = 0
    core_locations = []
    non_core = 0
    for index, value in enumerate(M[:, 1:].sum(axis=1)):
        if value > 50:
            core += 1
            core_locations.append(int(M[index, 0]))
        else:
            non_core += 1
    print "core", core, len(core_locations)
    print "non core", non_core
    print core / float((core + non_core))

    #M = heatmap.randomize_table(M[:,1:])

    #heatmap.heatmap_ksnp(M, format='png', output=output, breaks="-0.5, 0.5, 5, 99", rows=None, columns = sample_names,orderRows = False)
    return core_locations
def taxon2module2count(biodb, category=False):

    import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)

    if category:
        # C.pathway_category,taxon_id, A.pathway_name,A.n_enzymes, C.description
        # select distinct KO id
        # join with module to get counts/modules
        sql = 'select B.module_sub_cat,A.taxon_id,B.module_name,A.n,B.description from ' \
                            ' (select taxon_id, module_id, count(*) as n from ' \
                            ' (select distinct taxon_id,ko_id from enzyme.locus2ko_%s) t1 ' \
                            ' inner join enzyme.module2ko as t2 on t1.ko_id=t2.ko_id group by taxon_id, module_id) A ' \
                            ' inner join enzyme.kegg_module as B on A.module_id=B.module_id where module_sub_cat="%s";' % (biodb, category)
    else:
        sql = 'select B.module_sub_cat,A.taxon_id,B.module_name,A.n,B.description from ' \
                            ' (select taxon_id, module_id, count(*) as n from ' \
                            ' (select distinct taxon_id,ko_id from enzyme.locus2ko_%s) t1 ' \
                            ' inner join enzyme.module2ko as t2 on t1.ko_id=t2.ko_id group by taxon_id, module_id) A ' \
                            ' inner join enzyme.kegg_module as B on A.module_id=B.module_id;' % (biodb)

    pathway_data = server.adaptor.execute_and_fetchall(sql, )
    taxon2module2c = {}
    # 0 sub cat
    # 1 taxon_id
    # 2 module name
    # 3 count
    # 4 description
    for row in pathway_data:
        if row[1] not in taxon2module2c:
            taxon2module2c[row[1]] = {}
            taxon2module2c[row[1]][row[2]] = row[3]
        else:
            taxon2module2c[row[1]][row[2]] = row[3]
    return taxon2module2c
Ejemplo n.º 3
0
def pathway_list2profile_dico(biodb,
                              pathway_list,
                              taxon_id_list=[],
                              group_by_KO=True):
    import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select biodatabase_id from biodatabase where name="%s"' % biodb

    db_id = server.adaptor.execute_and_fetchall(sql, )[0][0]

    filter_2 = '"' + '","'.join(pathway_list) + '"'

    # RESTRICT TO AS SUBSET OF THE TAXON AVAILABLE
    if len(taxon_id_list) > 0:
        filter = ','.join(taxon_id_list)


        sql = 'select taxon_id, description, count(*) from (select distinct taxon_id,description,t3.pathway_id,t1.ko_id ' \
              ' from enzyme.locus2ko_%s t1 inner join enzyme.pathway2ko t2 on t1.ko_id=t2.ko_id ' \
              ' inner join enzyme.kegg_pathway as t3 on t2.pathway_id=t3.pathway_id ' \
              ' where t3.pathway_name in (%s) and taxon_id in (%s)) A group by taxon_id,pathway_id;' % (biodb,
                                                                                           filter_2,
                                                                                           filter)
        print(sql)
    else:
        if not group_by_KO:
            print('not grouping')
            sql = 'select taxon_id,description,count(*) from (select distinct taxon_id,description,t3.pathway_id,t1.ko_id ' \
                  ' from enzyme.locus2ko_%s t1 inner join enzyme.pathway2ko t2 on t1.ko_id=t2.ko_id ' \
                  ' inner join enzyme.kegg_pathway as t3 on t2.pathway_id=t3.pathway_id ' \
                  ' where t3.pathway_name in (%s)) A group by taxon_id,pathway_id;' % (biodb,
                                                                                               filter_2)
        else:
            print('grouping')
            sql = 'select taxon_id, description, count(*) from (select distinct taxon_id,description,t4.ko_accession ' \
                  ' from enzyme.seqfeature_id2ko_%s t1 ' \
                  ' inner join annotation.seqfeature_id2locus_%s tb on t1.seqfeature_id=tb.seqfeature_id ' \
                  ' inner join enzyme.pathway2ko t2 on t1.ko_id=t2.ko_id  inner join enzyme.kegg_pathway as t3 on t2.pathway_id=t3.pathway_id' \
                  ' inner join enzyme.ko_annotation t4 on t1.ko_id=t4.ko_id where pathway_name in (%s) ) A ' \
                  ' group by A.taxon_id, A.description;;' % (biodb,
                                                             biodb,
                                                             filter_2)
            print(sql)
    #print sql
    data = server.adaptor.execute_and_fetchall(sql, )

    code2taxon2count = {}
    pathway_list = []
    for row in data:
        row = list(row)
        #print row
        if row[1] not in pathway_list:
            pathway_list.append(row[1])
        if row[1] not in code2taxon2count:
            code2taxon2count[row[1]] = {}
            code2taxon2count[row[1]][str(row[0])] = int(row[2])
        else:
            code2taxon2count[row[1]][str(row[0])] = int(row[2])
    return pathway_list, code2taxon2count
def get_annotation(accession, biodb):

    import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    locus2COG = {}
    locus2KEGG = {}

    sql1 = 'select locus_tag,t1.COG_id,functon,name from COG_locus_tag2gi_hit t1 ' \
           ' inner join COG_cog_names_2014 t2 on t1.COG_id=t2.COG_id where accession="%s";' % (biodb, accession)

    for row in server.adaptor.execute_and_fetchall(sql1, ):
        if row[0] not in locus2COG:
            locus2COG[row[0]] = [row[1:]]
        else:
            locus2COG[row[0]].append(row[1:])

    sql2 = 'select t1.locus_tag, t3.ko_id,t6.definition,t5.pathway_name,t5.description from orthology_detail t1 ' \
           ' inner join enzyme_locus2ko t3 on t1.locus_tag=t3.locus_tag ' \
           ' inner join enzyme_pathway2ko t4 on t3.ko_id=t4.ko_id ' \
           ' inner join enzyme_kegg_pathway t5 on t4.pathway_id=t5.pathway_id inner join enzyme_ko_annotation t6 on t3.ko_id=t6.ko_id' \
           ' where t1.accession="%s";' % (biodb,
                                                                                                    biodb,
                                                                                                    accession)
    for row in server.adaptor.execute_and_fetchall(sql2, ):
        if row[0] not in locus2KEGG:
            locus2KEGG[row[0]] = [row[1:]]
        else:
            locus2KEGG[row[0]].append(row[1:])
    return locus2COG, locus2KEGG
Ejemplo n.º 5
0
def create_taxon_link_table(biodb):
    import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'create table interactions.colocalization_taxons_table_%s (group_1 varchar(200), group_2 varchar(200), taxon_1 INT,' \
          'taxon_2 INT)' % biodb

    server.adaptor.execute(sql,)
Ejemplo n.º 6
0
def create_group_link_table(biodb):
    import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'create table interactions.colocalization_table_%s (group_1 varchar(200), group_2 varchar(200), n_links INT,' \
          'n_comparisons INT, ratio float)' % biodb

    server.adaptor.execute(sql,)
Ejemplo n.º 7
0
def get_phylogeny(biodb):
    import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    sql_tree = 'select tree from reference_phylogeny as t1 inner join biodatabase as t2 on t1.biodatabase_id=t2.biodatabase_id where name="%s";' % biodb

    tree = server.adaptor.execute_and_fetchall(sql_tree)[0][0]
    return tree
Ejemplo n.º 8
0
def create_locus_link_table(biodb):
    import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'create table interactions.colocalization_table_locus_%s (locus_1 varchar(200), locus_2 varchar(200), n_links INT,' \
          'n_comparisons INT, ratio float, index locus_1 (locus_1), index locus_2 (locus_2), index n_links (n_links),' \
          ' index n_comparisons (n_comparisons), index ratio (ratio))' % biodb

    server.adaptor.execute(sql,)
Ejemplo n.º 9
0
def get_profile_fasta(biodb, taxon_id):
    '''

    - ordered taxon
    - transposed orthology table => each row is a different taxon

    :return:
    '''

    import manipulate_biosqldb
    import pandas
    import numpy
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio import SeqIO

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select taxon_id, accession from biodatabase t1 inner join bioentry t2 on t1.biodatabase_id=t2.biodatabase_id' \
          ' where (t1.name="%s" and t2.description not like "%%%%plasmid%%%%")' % biodb

    taxon2accession = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    taxon_id_filter = '`' + '`,`'.join(taxon2accession.keys()) + '`'

    sql = 'select t2.locus_tag,%s from comparative_tables.orthology_%s t1 inner join orthology_detail_%s t2 on t1.orthogroup=t2.orthogroup' \
          ' where t2.taxon_id=%s' % (taxon_id_filter, biodb, biodb, taxon_id)
    sql3 = 'show columns from comparative_tables.orthology_%s' % (biodb)

    data = numpy.array(
        [list(i) for i in server.adaptor.execute_and_fetchall(sql, )])

    all_cols = [i[0] for i in server.adaptor.execute_and_fetchall(sql3, )]

    count_df = pandas.DataFrame(data, columns=all_cols)

    count_df = count_df.set_index(['orthogroup'])
    count_df = count_df.apply(pandas.to_numeric, args=('coerce', ))
    count_df[(count_df > 1)] = 1
    #print count_df
    transposed_table = count_df.transpose()
    #print transposed_table
    #transposed_table.columns = []
    all_records = []
    for taxon, row in transposed_table.iterrows():
        #print taxon, row
        profile_dat = [str(i) for i in row]
        profile = ''.join(profile_dat)
        simple_seq = Seq(profile)
        simple_seq_r = SeqRecord(simple_seq)
        simple_seq_r.id = taxon2accession[taxon]
        simple_seq_r.description = ""
        all_records.append(simple_seq_r)
    with open("profiles_all.fasta", 'w') as tt:
        SeqIO.write(all_records, tt, 'fasta')
Ejemplo n.º 10
0
def create_taxon_link_table_locus(biodb):
    # locus_a, linked_locus, ref_ortho, linked_group, taxon_a,t
    import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'create table interactions.colocalization_taxons_table_locus_%s (locus_1  varchar(200), ' \
          ' locus_2 varchar(200), group_1 varchar(200), group_2 varchar(200), taxon_1 INT,' \
          'taxon_2 INT)' % biodb

    server.adaptor.execute(sql,)
def orthogroup2locus_and_sequences(biodb, orthogroup):

    import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select locus_tag, translation from orthology_detail_%s where orthogroup="%s"' % (
        biodb, orthogroup)

    data = server.adaptor.execute_and_fetchall(sql, )

    return data
Ejemplo n.º 12
0
def accession2dna_seg_genome(accession, biodb):
    import manipulate_biosqldb
    import numpy
    import pandas

    server, db = manipulate_biosqldb.load_db(biodb)
    sql = 'select locus_tag, start, stop, strand from orthology_detail_%s where accession="%s"' % (biodb, accession)

    data = numpy.array([list(i) for i in server.adaptor.execute_and_fetchall(sql,)])
    columns = ['name', 'start', 'end', 'strand']

    df = pandas.DataFrame(data, columns=columns)
    return df
Ejemplo n.º 13
0
def get_multiple_set_profiles(biodb,
                              set_list,
                              column="bitscore",
                              bitscore_cutoff=0,
                              query_coverage_cutoff=0,
                              hit_coverage_cut0ff=0):
    '''

    :param biodb:
    :param set_name:
    :param score: should one of those:  bitscore, bias, evalue, query_coverage
    :return:
    '''
    import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)

    set_filter = '"' + '","'.join(set_list) + '"'

    sql = 'select  from blast_sets t1 inner join blast_sets_entry t2 on t1.set_id=t2.set_id inner join blast_hits_annotated_genome_chlamydia_04_16 t3 ' \
          'on t2.hmm_id=t3.hmm_id inner join custom_tables.locus2seqfeature_id_chlamydia_04_16 t4 on t3.seqfeature_id=t4.seqfeature_id where t1.name="T3SS"'

    sql = 'select t3.taxon_id,t4.accession,%s,t1.name from blast.blast_sets t1 ' \
          ' inner join blast.blast_sets_entry t2 on t1.set_id=t2.set_id ' \
          ' inner join blast.blast_hits_annotated_genome_%s t3 on t2.seq_id=t3.seq_id ' \
          ' inner join blast.blast_db t4 on t2.seq_id=t4.seq_id ' \
          ' inner join custom_tables.locus2seqfeature_id_%s t5 on t3.seqfeature_id=t5.seqfeature_id' \
          ' where t1.name in (%s) and bitscore>=%s and query_coverage>=%s and hit_coverage>=%s order by bitscore;' % (column,
                                                                            biodb,
                                                                            biodb,
                                                                            set_filter,
                                                                            bitscore_cutoff,
                                                                            query_coverage_cutoff,
                                                                            hit_coverage_cut0ff)

    gene2taxon2score = {}
    gene_list = []
    data = server.adaptor.execute_and_fetchall(sql, )
    for row in data:
        gene_id = "%s (%s)" % (row[1], row[3])
        if gene_id not in gene_list:
            gene_list.append(gene_id)
        if gene_id not in gene2taxon2score:
            gene2taxon2score[gene_id] = {}
            gene2taxon2score[gene_id][str(row[0])] = row[2]
        else:
            if str(row[0]) not in gene2taxon2score[gene_id]:
                gene2taxon2score[gene_id][str(row[0])] = row[2]
            else:
                print('already present!!!!!!!')
                continue
    return gene2taxon2score, gene_list
Ejemplo n.º 14
0
def biodb2randomized_matrix(bio_db_name):
    server, db = manipulate_biosqldb.load_db(bio_db_name)
    matrix = np.array(
        manipulate_biosqldb.get_orthology_table(server, bio_db_name))

    taxon_id2description = manipulate_biosqldb.taxon_id2genome_description(
        server, bio_db_name)
    #print taxon_id2description

    #group_names = matrix[:,0]
    taxons_ids = manipulate_biosqldb.get_taxon_id_list(server, bio_db_name)

    print 'Number of taxons:', len(taxons_ids)
    taxons_ids = [taxon_id2description[str(i)] for i in taxons_ids]

    import re
    for i, accession in enumerate(taxons_ids):
        #print i, accession
        description = taxons_ids[i]
        description = re.sub(", complete genome\.", "", description)
        description = re.sub(", complete genome", "", description)
        description = re.sub(", complete sequence\.", "", description)
        description = re.sub("strain ", "", description)
        description = re.sub("str\. ", "", description)
        description = re.sub(" complete genome sequence\.", "", description)
        description = re.sub(" complete genome\.", "", description)
        description = re.sub(" chromosome", "", description)
        description = re.sub(" DNA", "S.", description)
        description = re.sub("Merged record from ", "", description)
        description = re.sub(", wgs", "", description)
        description = re.sub("Candidatus ", "", description)
        description = re.sub(".contig.0_1, whole genome shotgun sequence.", "",
                             description)
        description = re.sub("Protochlamydia", "P.", description)
        description = re.sub("Chlamydia", "C.", description)
        description = re.sub("Chlamydophila", "E.", description)
        description = re.sub("Estrella", "E.", description)
        description = re.sub("Rhodopirellula", "R.", description)
        description = re.sub("Methylacidiphilum", "M.", description)
        description = re.sub(" phage", "", description)
        description = re.sub("Parachlamydia", "P.", description)
        description = re.sub("Neochlamydia", "Neo.", description)
        description = re.sub("Simkania", "S.", description)
        description = re.sub("Waddlia", "W.", description)
        description = re.sub("Pirellula", "P.", description)
        description = re.sub("Rhabdochlamydiaceae sp.", "Rhabdo", description)
        taxons_ids[i] = description

    M = matrix.astype(float)  # [:, 1:]
    M = heatmap.randomize_table(M)
    return (M, taxons_ids)
Ejemplo n.º 15
0
def convert_tree_taxon_id2accession(biodb_name,
                                    input_tree,
                                    output_tree,
                                    sqlite=False):
    server, db = manipulate_biosqldb.load_db(biodb_name, sqlite=sqlite)
    taxon_id2accession = manipulate_biosqldb.taxon_id2accession_chromosome(
        server, biodb_name)
    for i in taxon_id2accession:
        taxon_id2accession[str(i)] = taxon_id2accession[i]
    print "taxon_id2accession", taxon_id2accession
    new_tree = parse_newick_tree.convert_terminal_node_names(
        input_tree, taxon_id2accession)

    Phylo.write(new_tree, output_tree, 'newick')
Ejemplo n.º 16
0
def convert_tree_accession2taxon_id(biodb_name,
                                    input_tree,
                                    output_tree,
                                    sqlite=False):
    server, db = manipulate_biosqldb.load_db(biodb_name, sqlite=sqlite)
    accession2taxon_id = manipulate_biosqldb.accession2taxon_id(
        server, biodb_name)
    for i in accession2taxon_id:
        accession2taxon_id[i] = str(accession2taxon_id[i])
    print "accession2taxon_id", accession2taxon_id
    new_tree = parse_newick_tree.convert_terminal_node_names(
        input_tree, accession2taxon_id)

    Phylo.write(new_tree, output_tree, 'newick')
Ejemplo n.º 17
0
def get_genome_seg(accession, biodb):

    import manipulate_biosqldb
    import pandas

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select genome_size from genomes_info_%s where accession="%s";' % (biodb, accession)

    genome_size = server.adaptor.execute_and_fetchall(sql,)[0][0]
    data = [(accession, 0, genome_size, '-')]
    columns = ['name', 'start', 'end', 'strand']

    df = pandas.DataFrame(data, columns=columns)
    return df
Ejemplo n.º 18
0
def insert_taxon_links_into_mysql(biodb,
                      group2taxons):
    import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)
    for i in group2taxons:
        for x in group2taxons[i]:
            for one_taxon_pair in group2taxons[i][x]:
                    sql = 'insert into interactions.colocalization_taxons_table_%s (group_1, group_2, taxon_1, taxon_2)' \
                          ' values ("%s","%s",%s,%s)' % (biodb,
                                                            i,
                                                            x,
                                                            one_taxon_pair[0],
                                                            one_taxon_pair[1])
                    server.adaptor.execute_and_fetchall(sql,)
    server.adaptor.commit()
Ejemplo n.º 19
0
def insert_locus_links_into_mysql(biodb, locus2links):
    import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)
    pair_done = []
    for locus_a in locus2links:
        for locus_b in locus2links[locus_a]:
            sql = 'insert into interactions.colocalization_table_locus_%s (locus_1, locus_2, n_links, n_comparisons, ratio)' \
                  ' values ("%s","%s",%s,%s,%s)' % (biodb,
                                                    locus_a,
                                                    locus_b,
                                                    locus2links[locus_a][locus_b][0],
                                                    locus2links[locus_a][locus_b][1],
                                                    locus2links[locus_a][locus_b][0]/float(locus2links[locus_a][locus_b][1]))
            server.adaptor.execute_and_fetchall(sql,)
    server.adaptor.commit()
Ejemplo n.º 20
0
def get_pairwise_connexions(accession_1, accession_2, biodb):
    import manipulate_biosqldb
    import numpy
    import pandas

    server, db = manipulate_biosqldb.load_db(biodb)

    sql1 = 'select seqfeature_id, start, stop from biosqldb.orthology_detail_%s where accession in ("%s","%s") ' % (biodb,
                                                                                                                   accession_1,
                                                                                                                   accession_2)
    seqfeature_id2location = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql1,))

    print seqfeature_id2location.keys()[0:10]

    sql2 = 'select accession, taxon_id from biodatabase t1 inner join bioentry t2 on t1.biodatabase_id=t2.biodatabase_id' \
           ' where t1.name="%s"' % biodb
    print sql2


    accession2taxon_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql2,))

    comp1_sql = 'select locus_1,locus_2,identity from (select * from ' \
                ' comparative_tables.identity_closest_homolog2_%s where taxon_1=%s and taxon_2=%s) A ' \
                ' inner join biosqldb.orthology_detail_%s B on A.locus_1=B.seqfeature_id;' % (biodb,
                                                                                              accession2taxon_id[accession_1],
                                                                                              accession2taxon_id[accession_2],
                                                                                              biodb)


    data = server.adaptor.execute_and_fetchall(comp1_sql,)

    comparison_table = []
    for row in data:
        print row
        try:
            start1 = seqfeature_id2location[int(row[0])][0]
            stop1 = seqfeature_id2location[int(row[0])][0]
            start2 = seqfeature_id2location[int(row[1])][0]
            stop2 = seqfeature_id2location[int(row[1])][0]
            identity = row[2]
            comparison_table.append([start1, stop1, start2, stop2, identity])
        except:
            pass
    data = numpy.array(comparison_table)
    columns = ['start1', 'end1', 'start2', 'end2', 'identity']
    df = pandas.DataFrame(data, columns=columns)
    return df
Ejemplo n.º 21
0
def transporter_family_heatmap(biodb,
                               family_list,
                               evalue,
                               bitscore,
                               query_cov,
                               hit_cov,
                               total=False):
    import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    family_filter = '"' + '","'.join(family_list) + '"'

    sql = 'select t1.taxon_id,t3.description, count(*) as n from transporters.transporters_%s t1 ' \
          ' inner join transporters.transporter_table t2 on t1.transporter_id=t2.transporter_id ' \
          ' inner join transporters.tc_table t3 on t2.family=t3.tc_id ' \
          ' inner join transporters.tc_table t4 on t2.family=t4.tc_id ' \
          ' where query_cov>=%s and hit_cov>=%s and evalue<=%s and bitscore_first_hsp>=%s and t4.tc_name in (%s) ' \
          ' group by taxon_id,t3.tc_id;' % (biodb, query_cov, hit_cov, evalue, bitscore, family_filter)
    print(sql)
    data = server.adaptor.execute_and_fetchall(sql, )

    code2taxon2count = {}
    if total:
        code2taxon2count['TOTAL'] = {}
        transporter_list = ['TOTAL']
    else:
        transporter_list = []
    for row in data:
        row = list(row)
        if total:
            if str(row[0]) not in code2taxon2count['TOTAL']:
                code2taxon2count['TOTAL'][str(row[0])] = int(row[2])
            else:
                code2taxon2count['TOTAL'][str(row[0])] += int(row[2])

        if row[1] not in transporter_list:
            transporter_list.append(row[1])

        if row[1] not in code2taxon2count:
            code2taxon2count[row[1]] = {}

            code2taxon2count[row[1]][str(row[0])] = int(row[2])
        else:
            code2taxon2count[row[1]][str(row[0])] = int(row[2])

    return transporter_list, code2taxon2count
Ejemplo n.º 22
0
def get_single_set_data(biodb,
                        set_name,
                        column="bitscore",
                        bitscore_cutoff=0,
                        query_coverage_cutoff=0):
    '''

    :param biodb:
    :param set_name:
    :param score: should one of those:  bitscore, bias, evalue, query_coverage
    :return:
    '''
    import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select  from hmm_sets t1 inner join hmm_sets_entry t2 on t1.set_id=t2.set_id ' \
          ' inner join hmm_hits_annotated_genome_chlamydia_04_16 t3 ' \
          ' on t2.hmm_id=t3.hmm_id inner join custom_tables.locus2seqfeature_id_chlamydia_04_16 t4 ' \
          ' on t3.seqfeature_id=t4.seqfeature_id where t1.name="T3SS"'

    sql = 'select t3.taxon_id,t4.name,%s from hmm.hmm_sets t1 ' \
          ' inner join hmm.hmm_sets_entry t2 on t1.set_id=t2.set_id ' \
          ' inner join hmm.hmm_hits_annotated_genome_%s t3 on t2.hmm_id=t3.hmm_id ' \
          ' inner join hmm.hmm_profiles t4 on t2.hmm_id=t4.hmm_id ' \
          ' inner join custom_tables.locus2seqfeature_id_%s t5 on t3.seqfeature_id=t5.seqfeature_id' \
          ' where t1.name="%s" and bitscore>=%s and query_coverage>=%s order by bitscore;' % (column,
                                                                            biodb,
                                                                            biodb,
                                                                            set_name,
                                                                            bitscore_cutoff,
                                                                            query_coverage_cutoff)
    gene2taxon2score = {}
    gene_list = []
    data = server.adaptor.execute_and_fetchall(sql, )
    for row in data:
        if row[1] not in gene_list:
            gene_list.append(row[1])
        if row[1] not in gene2taxon2score:
            gene2taxon2score[row[1]] = {}
            gene2taxon2score[row[1]][str(row[0])] = row[2]
        else:
            if str(row[0]) not in gene2taxon2score[row[1]]:
                gene2taxon2score[row[1]][str(row[0])] = row[2]
            else:
                print('already present!!!!!!!')
                continue
    return gene2taxon2score, gene_list
Ejemplo n.º 23
0
def insert_taxon_links_into_mysql_locus(biodb, locus2taxons):
    import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)
    for locus_1 in locus2taxons:
        for locus_2 in locus2taxons[locus_1]:
            for one_taxon_pair in locus2taxons[locus_1][locus_2]:
                    sql = 'insert into interactions.colocalization_taxons_table_locus_%s (locus_1, locus_2, group_1, ' \
                          ' group_2, taxon_1, taxon_2)' \
                          ' values ("%s","%s","%s","%s", %s, %s)' % (biodb,
                                                            locus_1,
                                                            locus_2,
                                                            one_taxon_pair[0],
                                                            one_taxon_pair[1],
                                                            one_taxon_pair[2],
                                                            one_taxon_pair[3])
                    server.adaptor.execute_and_fetchall(sql,)
    server.adaptor.commit()
Ejemplo n.º 24
0
def convert_tree_taxon2genome(biodb_name,
                              input_tree,
                              output_tree,
                              sqlite=False):
    server, db = manipulate_biosqldb.load_db(biodb_name, sqlite=sqlite)
    print biodb_name
    taxon_id2genome_description = manipulate_biosqldb.taxon_id2genome_description(
        server, biodb_name)

    print taxon_id2genome_description

    #locus2genome = manipulate_biosqldb.locus_tag2genome_name(server, biodb_name)

    import re
    for i in taxon_id2genome_description.keys():
        print i
        taxon_id2genome_description[i] = re.sub(" subsp\. aureus", "",
                                                taxon_id2genome_description[i])
        taxon_id2genome_description[i] = re.sub(", complete genome\.", "",
                                                taxon_id2genome_description[i])
        taxon_id2genome_description[i] = re.sub(", complete sequence\.", "",
                                                taxon_id2genome_description[i])
        taxon_id2genome_description[i] = re.sub("strain ", "",
                                                taxon_id2genome_description[i])
        taxon_id2genome_description[i] = re.sub("str\. ", "",
                                                taxon_id2genome_description[i])
        taxon_id2genome_description[i] = re.sub(" complete genome sequence\.",
                                                "",
                                                taxon_id2genome_description[i])
        taxon_id2genome_description[i] = re.sub(" complete genome\.", "",
                                                taxon_id2genome_description[i])
        taxon_id2genome_description[i] = re.sub(" chromosome", "",
                                                taxon_id2genome_description[i])
        taxon_id2genome_description[i] = re.sub("Staphylococcus", "S.",
                                                taxon_id2genome_description[i])
        taxon_id2genome_description[i] = re.sub(" DNA", "S.",
                                                taxon_id2genome_description[i])
    #print taxon_id2genome_description[i]

    print taxon_id2genome_description
    new_tree = parse_newick_tree.convert_terminal_node_names(
        input_tree, taxon_id2genome_description)
    #print new_tree[0]
    print "writing converted tree..."
    print output_tree
    Phylo.write(new_tree, output_tree, 'newick')
Ejemplo n.º 25
0
def get_multiple_set_counts(biodb,
                            set_list,
                            bitscore_cutoff=0,
                            query_coverage_cutoff=0,
                            hit_coverage_cut0ff=0):
    '''

    :param biodb:
    :param set_name:
    :param score: should one of those:  bitscore, bias, evalue, query_coverage
    :return:
    '''
    import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)

    set_filter = '"' + '","'.join(set_list) + '"'

    sql = 'select  from blast_sets t1 inner join blast_sets_entry t2 on t1.set_id=t2.set_id inner join blast_hits_annotated_genome_chlamydia_04_16 t3 ' \
          'on t2.hmm_id=t3.hmm_id inner join custom_tables.locus2seqfeature_id_chlamydia_04_16 t4 on t3.seqfeature_id=t4.seqfeature_id where t1.name="T3SS"'

    sql = 'select taxon_id,A.name, count(*) from (select t1.*,t4.* from blast.blast_sets t1 inner join blast.blast_sets_entry t2 ' \
          ' on t1.set_id=t2.set_id inner join blast.blast_db t3 on t2.seq_id=t3.seq_id ' \
          ' inner join blast.blast_hits_annotated_genome_%s t4 on t3.seq_id=t4.seq_id ' \
          ' where t1.name in (%s) and bitscore>=%s and query_coverage>=%s and hit_coverage>=%s ' \
          ' group by taxon_id,name,seq_id) A group by taxon_id,A.name;' % (biodb,
                                                                      set_filter,
                                                                      bitscore_cutoff,
                                                                      query_coverage_cutoff,
                                                                      hit_coverage_cut0ff)
    print(sql)
    set2taxon2count = {}
    # taxon_id, set_name, count
    data = server.adaptor.execute_and_fetchall(sql, )
    for row in data:
        taxon_id, set_name, count = row
        if set_name not in set2taxon2count:
            set2taxon2count[set_name] = {}
            set2taxon2count[set_name][str(taxon_id)] = int(count)
        else:
            if str(row[0]) not in set2taxon2count[set_name]:
                set2taxon2count[set_name][str(taxon_id)] = int(count)
            else:
                print('already present!!!!!!!')
                continue
    return set2taxon2count
Ejemplo n.º 26
0
def get_annotation_file(biodb, taxon_id):
    '''
    Pos	ID	Description
    1	G0001	Module A subunit 1
    2	G0002	Module A subunit 2

    :return:
    '''
    import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select t2.locus_tag,t2.product from comparative_tables.orthology_%s t1 inner join orthology_detail_%s t2 on t1.orthogroup=t2.orthogroup' \
          ' where t2.taxon_id=%s' % (biodb, biodb, taxon_id)
    data = server.adaptor.execute_and_fetchall(sql, )
    print 'Pos\tID\tDescription'
    for i, row in enumerate(data):
        print "%s\t%s\t%s" % (i + 1, row[0], row[1])
Ejemplo n.º 27
0
def insert_group_links_into_mysql(biodb,
                      group2links):
    import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)
    pair_done = []
    for i in group2links:
        for x in group2links[i]:
            if [i, x] in pair_done:
                continue
            else:
                pair_done.append([i, x])
                pair_done.append([x, i])
                sql = 'insert into interactions.colocalization_table_%s (group_1, group_2, n_links, n_comparisons, ratio)' \
                      ' values ("%s","%s",%s,%s,%s)' % (biodb,
                                                        i,
                                                        x,
                                                        group2links[i][x][0],
                                                        group2links[i][x][1],
                                                        group2links[i][x][0]/float(group2links[i][x][1]))
                server.adaptor.execute_and_fetchall(sql,)
    server.adaptor.commit()
Ejemplo n.º 28
0
def transporter_superfamily_heatmap(biodb, family, evalue, bitscore, query_cov,
                                    hit_cov):
    import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select t1.taxon_id,t3.description, count(*) as n, t3.tc_name from transporters.transporters_%s t1 ' \
          ' inner join transporters.transporter_table t2 on t1.transporter_id=t2.transporter_id ' \
          ' inner join transporters.tc_table t3 on t2.family=t3.tc_id ' \
          ' inner join transporters.tc_table t4 on t2.superfamily=t4.tc_id ' \
          ' where query_cov>%s and hit_cov>%s and evalue<=%s and bitscore_first_hsp>=%s and t4.description="%s" ' \
          ' group by taxon_id,t3.tc_id;' % (biodb, query_cov, hit_cov, evalue, bitscore, family)
    print(sql)
    data = server.adaptor.execute_and_fetchall(sql, )

    code2taxon2count = {}
    code2taxon2count['TOTAL'] = {}
    transporter_list = ['TOTAL']
    for row in data:
        if row[3] not in row[1]:
            family_label = "%s: %s" % (row[3], row[1])
        else:
            family_label = row[1]
        row = list(row)
        if str(row[0]) not in code2taxon2count['TOTAL']:
            code2taxon2count['TOTAL'][str(row[0])] = int(row[2])
        else:
            code2taxon2count['TOTAL'][str(row[0])] += int(row[2])

        if family_label not in transporter_list:
            transporter_list.append(family_label)

        if family_label not in code2taxon2count:
            code2taxon2count[family_label] = {}

            code2taxon2count[family_label][str(row[0])] = int(row[2])
        else:
            code2taxon2count[family_label][str(row[0])] = int(row[2])

    return transporter_list, code2taxon2count
Ejemplo n.º 29
0
def write_ortho_matrix(bio_db_name):
    server, db = manipulate_biosqldb.load_db(bio_db_name)
    matrix = np.array(
        manipulate_biosqldb.get_orthology_table(server, bio_db_name))

    taxon_id2description = manipulate_biosqldb.taxon_id2genome_description(
        server, bio_db_name)

    group_names = matrix[:, 0]
    taxons_ids = manipulate_biosqldb.get_taxon_id_list(server, bio_db_name)

    import re
    for i in taxon_id2description.keys():
        taxon_id2description[i] = re.sub(" subsp\. aureus", "",
                                         taxon_id2description[i])
        taxon_id2description[i] = re.sub(", complete genome\.", "",
                                         taxon_id2description[i])
        taxon_id2description[i] = re.sub(", complete sequence\.", "",
                                         taxon_id2description[i])
        taxon_id2description[i] = re.sub("strain ", "",
                                         taxon_id2description[i])
        taxon_id2description[i] = re.sub("str\. ", "", taxon_id2description[i])
        taxon_id2description[i] = re.sub(" complete genome sequence\.", "",
                                         taxon_id2description[i])
        taxon_id2description[i] = re.sub(" complete genome\.", "",
                                         taxon_id2description[i])
        taxon_id2description[i] = re.sub(" chromosome", "",
                                         taxon_id2description[i])
        taxon_id2description[i] = re.sub(" DNA", "", taxon_id2description[i])

    taxons_ids = [taxon_id2description[str(i)] for i in taxons_ids]
    #print taxon_id2description
    f = open("ortho_matrix.tab", "w")

    f.write("orthogroup\t" + "\t".join(taxons_ids) + "\n")

    for row in matrix:
        f.write("\t".join(row) + "\n")
    f.close()
def get_module_count_all_db(biodb, category=False):
    '''

    :param biodb: <biodatabase name>
    :param category: KEGG module category (optional)
    :return: for each module, return the total count from KEGG, and the total count of KO present in the <biodb>
    '''

    import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)

    sql_biodb_id = 'select biodatabase_id from biodatabase where name="%s"' % biodb

    database_id = server.adaptor.execute_and_fetchall(sql_biodb_id, )[0][0]

    if category:

        sql_pathway_count = 'select BB.module_name,count_all,count_db,count_db/count_all from (select module_id, count(*) ' \
                            ' as count_db from (select distinct ko_id from enzyme.locus2ko_%s) as t1' \
                            ' inner join enzyme.module2ko as t2 on t1.ko_id=t2.ko_id group by module_id) AA ' \
                            ' right join (select t1.module_id,module_name, count_all from (select module_id, count(*) as count_all ' \
                            'from enzyme.module2ko group by module_id) t1 inner join enzyme.kegg_module as t2 ' \
                            'on t1.module_id=t2.module_id where module_sub_cat="%s")BB on AA.module_id=BB.module_id;' % (biodb, category) # where pathway_category!="1.0 Global and overview maps"
    else:
        # select distinct KO
        # join with module
        sql_pathway_count = 'select BB.module_name,count_all,count_db,count_db/count_all from (select module_id, count(*) ' \
                            ' as count_db from (select distinct ko_id from enzyme.locus2ko_%s) as t1' \
                            ' inner join enzyme.module2ko as t2 on t1.ko_id=t2.ko_id group by module_id) AA ' \
                            ' right join (select t1.module_id,module_name, count_all from (select module_id, count(*) as count_all ' \
                            'from enzyme.module2ko group by module_id) t1 inner join enzyme.kegg_module as t2 ' \
                            'on t1.module_id=t2.module_id)BB on AA.module_id=BB.module_id;' % (biodb) # where pathway_category!="1.0 Global and overview maps"

    map2count = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql_pathway_count, ))
    return map2count