def vcf2heatmap(vcf_file, biodb="saureus_01_15", output="heat_ksnp_SaC.pdf"): my_array, sample_names = parse_vcf(vcf_file) server, db = manipulate_biosqldb.load_db(biodb) accession2description = manipulate_biosqldb.accession2description( server, biodb) for i in range(0, len(sample_names)): try: sample_names[i] = accession2description[sample_names[i]] except: pass M = my_array[:, 0:] #print M[0:10,:] # tri en fonction de la localisation #M = M[M[:, 0].argsort()] core = 0 core_locations = [] non_core = 0 for index, value in enumerate(M[:, 1:].sum(axis=1)): if value > 50: core += 1 core_locations.append(int(M[index, 0])) else: non_core += 1 print "core", core, len(core_locations) print "non core", non_core print core / float((core + non_core)) #M = heatmap.randomize_table(M[:,1:]) #heatmap.heatmap_ksnp(M, format='png', output=output, breaks="-0.5, 0.5, 5, 99", rows=None, columns = sample_names,orderRows = False) return core_locations
def taxon2module2count(biodb, category=False): import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) if category: # C.pathway_category,taxon_id, A.pathway_name,A.n_enzymes, C.description # select distinct KO id # join with module to get counts/modules sql = 'select B.module_sub_cat,A.taxon_id,B.module_name,A.n,B.description from ' \ ' (select taxon_id, module_id, count(*) as n from ' \ ' (select distinct taxon_id,ko_id from enzyme.locus2ko_%s) t1 ' \ ' inner join enzyme.module2ko as t2 on t1.ko_id=t2.ko_id group by taxon_id, module_id) A ' \ ' inner join enzyme.kegg_module as B on A.module_id=B.module_id where module_sub_cat="%s";' % (biodb, category) else: sql = 'select B.module_sub_cat,A.taxon_id,B.module_name,A.n,B.description from ' \ ' (select taxon_id, module_id, count(*) as n from ' \ ' (select distinct taxon_id,ko_id from enzyme.locus2ko_%s) t1 ' \ ' inner join enzyme.module2ko as t2 on t1.ko_id=t2.ko_id group by taxon_id, module_id) A ' \ ' inner join enzyme.kegg_module as B on A.module_id=B.module_id;' % (biodb) pathway_data = server.adaptor.execute_and_fetchall(sql, ) taxon2module2c = {} # 0 sub cat # 1 taxon_id # 2 module name # 3 count # 4 description for row in pathway_data: if row[1] not in taxon2module2c: taxon2module2c[row[1]] = {} taxon2module2c[row[1]][row[2]] = row[3] else: taxon2module2c[row[1]][row[2]] = row[3] return taxon2module2c
def pathway_list2profile_dico(biodb, pathway_list, taxon_id_list=[], group_by_KO=True): import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) sql = 'select biodatabase_id from biodatabase where name="%s"' % biodb db_id = server.adaptor.execute_and_fetchall(sql, )[0][0] filter_2 = '"' + '","'.join(pathway_list) + '"' # RESTRICT TO AS SUBSET OF THE TAXON AVAILABLE if len(taxon_id_list) > 0: filter = ','.join(taxon_id_list) sql = 'select taxon_id, description, count(*) from (select distinct taxon_id,description,t3.pathway_id,t1.ko_id ' \ ' from enzyme.locus2ko_%s t1 inner join enzyme.pathway2ko t2 on t1.ko_id=t2.ko_id ' \ ' inner join enzyme.kegg_pathway as t3 on t2.pathway_id=t3.pathway_id ' \ ' where t3.pathway_name in (%s) and taxon_id in (%s)) A group by taxon_id,pathway_id;' % (biodb, filter_2, filter) print(sql) else: if not group_by_KO: print('not grouping') sql = 'select taxon_id,description,count(*) from (select distinct taxon_id,description,t3.pathway_id,t1.ko_id ' \ ' from enzyme.locus2ko_%s t1 inner join enzyme.pathway2ko t2 on t1.ko_id=t2.ko_id ' \ ' inner join enzyme.kegg_pathway as t3 on t2.pathway_id=t3.pathway_id ' \ ' where t3.pathway_name in (%s)) A group by taxon_id,pathway_id;' % (biodb, filter_2) else: print('grouping') sql = 'select taxon_id, description, count(*) from (select distinct taxon_id,description,t4.ko_accession ' \ ' from enzyme.seqfeature_id2ko_%s t1 ' \ ' inner join annotation.seqfeature_id2locus_%s tb on t1.seqfeature_id=tb.seqfeature_id ' \ ' inner join enzyme.pathway2ko t2 on t1.ko_id=t2.ko_id inner join enzyme.kegg_pathway as t3 on t2.pathway_id=t3.pathway_id' \ ' inner join enzyme.ko_annotation t4 on t1.ko_id=t4.ko_id where pathway_name in (%s) ) A ' \ ' group by A.taxon_id, A.description;;' % (biodb, biodb, filter_2) print(sql) #print sql data = server.adaptor.execute_and_fetchall(sql, ) code2taxon2count = {} pathway_list = [] for row in data: row = list(row) #print row if row[1] not in pathway_list: pathway_list.append(row[1]) if row[1] not in code2taxon2count: code2taxon2count[row[1]] = {} code2taxon2count[row[1]][str(row[0])] = int(row[2]) else: code2taxon2count[row[1]][str(row[0])] = int(row[2]) return pathway_list, code2taxon2count
def get_annotation(accession, biodb): import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) locus2COG = {} locus2KEGG = {} sql1 = 'select locus_tag,t1.COG_id,functon,name from COG_locus_tag2gi_hit t1 ' \ ' inner join COG_cog_names_2014 t2 on t1.COG_id=t2.COG_id where accession="%s";' % (biodb, accession) for row in server.adaptor.execute_and_fetchall(sql1, ): if row[0] not in locus2COG: locus2COG[row[0]] = [row[1:]] else: locus2COG[row[0]].append(row[1:]) sql2 = 'select t1.locus_tag, t3.ko_id,t6.definition,t5.pathway_name,t5.description from orthology_detail t1 ' \ ' inner join enzyme_locus2ko t3 on t1.locus_tag=t3.locus_tag ' \ ' inner join enzyme_pathway2ko t4 on t3.ko_id=t4.ko_id ' \ ' inner join enzyme_kegg_pathway t5 on t4.pathway_id=t5.pathway_id inner join enzyme_ko_annotation t6 on t3.ko_id=t6.ko_id' \ ' where t1.accession="%s";' % (biodb, biodb, accession) for row in server.adaptor.execute_and_fetchall(sql2, ): if row[0] not in locus2KEGG: locus2KEGG[row[0]] = [row[1:]] else: locus2KEGG[row[0]].append(row[1:]) return locus2COG, locus2KEGG
def create_taxon_link_table(biodb): import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) sql = 'create table interactions.colocalization_taxons_table_%s (group_1 varchar(200), group_2 varchar(200), taxon_1 INT,' \ 'taxon_2 INT)' % biodb server.adaptor.execute(sql,)
def create_group_link_table(biodb): import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) sql = 'create table interactions.colocalization_table_%s (group_1 varchar(200), group_2 varchar(200), n_links INT,' \ 'n_comparisons INT, ratio float)' % biodb server.adaptor.execute(sql,)
def get_phylogeny(biodb): import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) sql_tree = 'select tree from reference_phylogeny as t1 inner join biodatabase as t2 on t1.biodatabase_id=t2.biodatabase_id where name="%s";' % biodb tree = server.adaptor.execute_and_fetchall(sql_tree)[0][0] return tree
def create_locus_link_table(biodb): import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) sql = 'create table interactions.colocalization_table_locus_%s (locus_1 varchar(200), locus_2 varchar(200), n_links INT,' \ 'n_comparisons INT, ratio float, index locus_1 (locus_1), index locus_2 (locus_2), index n_links (n_links),' \ ' index n_comparisons (n_comparisons), index ratio (ratio))' % biodb server.adaptor.execute(sql,)
def get_profile_fasta(biodb, taxon_id): ''' - ordered taxon - transposed orthology table => each row is a different taxon :return: ''' import manipulate_biosqldb import pandas import numpy from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from Bio import SeqIO server, db = manipulate_biosqldb.load_db(biodb) sql = 'select taxon_id, accession from biodatabase t1 inner join bioentry t2 on t1.biodatabase_id=t2.biodatabase_id' \ ' where (t1.name="%s" and t2.description not like "%%%%plasmid%%%%")' % biodb taxon2accession = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) taxon_id_filter = '`' + '`,`'.join(taxon2accession.keys()) + '`' sql = 'select t2.locus_tag,%s from comparative_tables.orthology_%s t1 inner join orthology_detail_%s t2 on t1.orthogroup=t2.orthogroup' \ ' where t2.taxon_id=%s' % (taxon_id_filter, biodb, biodb, taxon_id) sql3 = 'show columns from comparative_tables.orthology_%s' % (biodb) data = numpy.array( [list(i) for i in server.adaptor.execute_and_fetchall(sql, )]) all_cols = [i[0] for i in server.adaptor.execute_and_fetchall(sql3, )] count_df = pandas.DataFrame(data, columns=all_cols) count_df = count_df.set_index(['orthogroup']) count_df = count_df.apply(pandas.to_numeric, args=('coerce', )) count_df[(count_df > 1)] = 1 #print count_df transposed_table = count_df.transpose() #print transposed_table #transposed_table.columns = [] all_records = [] for taxon, row in transposed_table.iterrows(): #print taxon, row profile_dat = [str(i) for i in row] profile = ''.join(profile_dat) simple_seq = Seq(profile) simple_seq_r = SeqRecord(simple_seq) simple_seq_r.id = taxon2accession[taxon] simple_seq_r.description = "" all_records.append(simple_seq_r) with open("profiles_all.fasta", 'w') as tt: SeqIO.write(all_records, tt, 'fasta')
def create_taxon_link_table_locus(biodb): # locus_a, linked_locus, ref_ortho, linked_group, taxon_a,t import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) sql = 'create table interactions.colocalization_taxons_table_locus_%s (locus_1 varchar(200), ' \ ' locus_2 varchar(200), group_1 varchar(200), group_2 varchar(200), taxon_1 INT,' \ 'taxon_2 INT)' % biodb server.adaptor.execute(sql,)
def orthogroup2locus_and_sequences(biodb, orthogroup): import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) sql = 'select locus_tag, translation from orthology_detail_%s where orthogroup="%s"' % ( biodb, orthogroup) data = server.adaptor.execute_and_fetchall(sql, ) return data
def accession2dna_seg_genome(accession, biodb): import manipulate_biosqldb import numpy import pandas server, db = manipulate_biosqldb.load_db(biodb) sql = 'select locus_tag, start, stop, strand from orthology_detail_%s where accession="%s"' % (biodb, accession) data = numpy.array([list(i) for i in server.adaptor.execute_and_fetchall(sql,)]) columns = ['name', 'start', 'end', 'strand'] df = pandas.DataFrame(data, columns=columns) return df
def get_multiple_set_profiles(biodb, set_list, column="bitscore", bitscore_cutoff=0, query_coverage_cutoff=0, hit_coverage_cut0ff=0): ''' :param biodb: :param set_name: :param score: should one of those: bitscore, bias, evalue, query_coverage :return: ''' import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) set_filter = '"' + '","'.join(set_list) + '"' sql = 'select from blast_sets t1 inner join blast_sets_entry t2 on t1.set_id=t2.set_id inner join blast_hits_annotated_genome_chlamydia_04_16 t3 ' \ 'on t2.hmm_id=t3.hmm_id inner join custom_tables.locus2seqfeature_id_chlamydia_04_16 t4 on t3.seqfeature_id=t4.seqfeature_id where t1.name="T3SS"' sql = 'select t3.taxon_id,t4.accession,%s,t1.name from blast.blast_sets t1 ' \ ' inner join blast.blast_sets_entry t2 on t1.set_id=t2.set_id ' \ ' inner join blast.blast_hits_annotated_genome_%s t3 on t2.seq_id=t3.seq_id ' \ ' inner join blast.blast_db t4 on t2.seq_id=t4.seq_id ' \ ' inner join custom_tables.locus2seqfeature_id_%s t5 on t3.seqfeature_id=t5.seqfeature_id' \ ' where t1.name in (%s) and bitscore>=%s and query_coverage>=%s and hit_coverage>=%s order by bitscore;' % (column, biodb, biodb, set_filter, bitscore_cutoff, query_coverage_cutoff, hit_coverage_cut0ff) gene2taxon2score = {} gene_list = [] data = server.adaptor.execute_and_fetchall(sql, ) for row in data: gene_id = "%s (%s)" % (row[1], row[3]) if gene_id not in gene_list: gene_list.append(gene_id) if gene_id not in gene2taxon2score: gene2taxon2score[gene_id] = {} gene2taxon2score[gene_id][str(row[0])] = row[2] else: if str(row[0]) not in gene2taxon2score[gene_id]: gene2taxon2score[gene_id][str(row[0])] = row[2] else: print('already present!!!!!!!') continue return gene2taxon2score, gene_list
def biodb2randomized_matrix(bio_db_name): server, db = manipulate_biosqldb.load_db(bio_db_name) matrix = np.array( manipulate_biosqldb.get_orthology_table(server, bio_db_name)) taxon_id2description = manipulate_biosqldb.taxon_id2genome_description( server, bio_db_name) #print taxon_id2description #group_names = matrix[:,0] taxons_ids = manipulate_biosqldb.get_taxon_id_list(server, bio_db_name) print 'Number of taxons:', len(taxons_ids) taxons_ids = [taxon_id2description[str(i)] for i in taxons_ids] import re for i, accession in enumerate(taxons_ids): #print i, accession description = taxons_ids[i] description = re.sub(", complete genome\.", "", description) description = re.sub(", complete genome", "", description) description = re.sub(", complete sequence\.", "", description) description = re.sub("strain ", "", description) description = re.sub("str\. ", "", description) description = re.sub(" complete genome sequence\.", "", description) description = re.sub(" complete genome\.", "", description) description = re.sub(" chromosome", "", description) description = re.sub(" DNA", "S.", description) description = re.sub("Merged record from ", "", description) description = re.sub(", wgs", "", description) description = re.sub("Candidatus ", "", description) description = re.sub(".contig.0_1, whole genome shotgun sequence.", "", description) description = re.sub("Protochlamydia", "P.", description) description = re.sub("Chlamydia", "C.", description) description = re.sub("Chlamydophila", "E.", description) description = re.sub("Estrella", "E.", description) description = re.sub("Rhodopirellula", "R.", description) description = re.sub("Methylacidiphilum", "M.", description) description = re.sub(" phage", "", description) description = re.sub("Parachlamydia", "P.", description) description = re.sub("Neochlamydia", "Neo.", description) description = re.sub("Simkania", "S.", description) description = re.sub("Waddlia", "W.", description) description = re.sub("Pirellula", "P.", description) description = re.sub("Rhabdochlamydiaceae sp.", "Rhabdo", description) taxons_ids[i] = description M = matrix.astype(float) # [:, 1:] M = heatmap.randomize_table(M) return (M, taxons_ids)
def convert_tree_taxon_id2accession(biodb_name, input_tree, output_tree, sqlite=False): server, db = manipulate_biosqldb.load_db(biodb_name, sqlite=sqlite) taxon_id2accession = manipulate_biosqldb.taxon_id2accession_chromosome( server, biodb_name) for i in taxon_id2accession: taxon_id2accession[str(i)] = taxon_id2accession[i] print "taxon_id2accession", taxon_id2accession new_tree = parse_newick_tree.convert_terminal_node_names( input_tree, taxon_id2accession) Phylo.write(new_tree, output_tree, 'newick')
def convert_tree_accession2taxon_id(biodb_name, input_tree, output_tree, sqlite=False): server, db = manipulate_biosqldb.load_db(biodb_name, sqlite=sqlite) accession2taxon_id = manipulate_biosqldb.accession2taxon_id( server, biodb_name) for i in accession2taxon_id: accession2taxon_id[i] = str(accession2taxon_id[i]) print "accession2taxon_id", accession2taxon_id new_tree = parse_newick_tree.convert_terminal_node_names( input_tree, accession2taxon_id) Phylo.write(new_tree, output_tree, 'newick')
def get_genome_seg(accession, biodb): import manipulate_biosqldb import pandas server, db = manipulate_biosqldb.load_db(biodb) sql = 'select genome_size from genomes_info_%s where accession="%s";' % (biodb, accession) genome_size = server.adaptor.execute_and_fetchall(sql,)[0][0] data = [(accession, 0, genome_size, '-')] columns = ['name', 'start', 'end', 'strand'] df = pandas.DataFrame(data, columns=columns) return df
def insert_taxon_links_into_mysql(biodb, group2taxons): import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) for i in group2taxons: for x in group2taxons[i]: for one_taxon_pair in group2taxons[i][x]: sql = 'insert into interactions.colocalization_taxons_table_%s (group_1, group_2, taxon_1, taxon_2)' \ ' values ("%s","%s",%s,%s)' % (biodb, i, x, one_taxon_pair[0], one_taxon_pair[1]) server.adaptor.execute_and_fetchall(sql,) server.adaptor.commit()
def insert_locus_links_into_mysql(biodb, locus2links): import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) pair_done = [] for locus_a in locus2links: for locus_b in locus2links[locus_a]: sql = 'insert into interactions.colocalization_table_locus_%s (locus_1, locus_2, n_links, n_comparisons, ratio)' \ ' values ("%s","%s",%s,%s,%s)' % (biodb, locus_a, locus_b, locus2links[locus_a][locus_b][0], locus2links[locus_a][locus_b][1], locus2links[locus_a][locus_b][0]/float(locus2links[locus_a][locus_b][1])) server.adaptor.execute_and_fetchall(sql,) server.adaptor.commit()
def get_pairwise_connexions(accession_1, accession_2, biodb): import manipulate_biosqldb import numpy import pandas server, db = manipulate_biosqldb.load_db(biodb) sql1 = 'select seqfeature_id, start, stop from biosqldb.orthology_detail_%s where accession in ("%s","%s") ' % (biodb, accession_1, accession_2) seqfeature_id2location = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql1,)) print seqfeature_id2location.keys()[0:10] sql2 = 'select accession, taxon_id from biodatabase t1 inner join bioentry t2 on t1.biodatabase_id=t2.biodatabase_id' \ ' where t1.name="%s"' % biodb print sql2 accession2taxon_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql2,)) comp1_sql = 'select locus_1,locus_2,identity from (select * from ' \ ' comparative_tables.identity_closest_homolog2_%s where taxon_1=%s and taxon_2=%s) A ' \ ' inner join biosqldb.orthology_detail_%s B on A.locus_1=B.seqfeature_id;' % (biodb, accession2taxon_id[accession_1], accession2taxon_id[accession_2], biodb) data = server.adaptor.execute_and_fetchall(comp1_sql,) comparison_table = [] for row in data: print row try: start1 = seqfeature_id2location[int(row[0])][0] stop1 = seqfeature_id2location[int(row[0])][0] start2 = seqfeature_id2location[int(row[1])][0] stop2 = seqfeature_id2location[int(row[1])][0] identity = row[2] comparison_table.append([start1, stop1, start2, stop2, identity]) except: pass data = numpy.array(comparison_table) columns = ['start1', 'end1', 'start2', 'end2', 'identity'] df = pandas.DataFrame(data, columns=columns) return df
def transporter_family_heatmap(biodb, family_list, evalue, bitscore, query_cov, hit_cov, total=False): import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) family_filter = '"' + '","'.join(family_list) + '"' sql = 'select t1.taxon_id,t3.description, count(*) as n from transporters.transporters_%s t1 ' \ ' inner join transporters.transporter_table t2 on t1.transporter_id=t2.transporter_id ' \ ' inner join transporters.tc_table t3 on t2.family=t3.tc_id ' \ ' inner join transporters.tc_table t4 on t2.family=t4.tc_id ' \ ' where query_cov>=%s and hit_cov>=%s and evalue<=%s and bitscore_first_hsp>=%s and t4.tc_name in (%s) ' \ ' group by taxon_id,t3.tc_id;' % (biodb, query_cov, hit_cov, evalue, bitscore, family_filter) print(sql) data = server.adaptor.execute_and_fetchall(sql, ) code2taxon2count = {} if total: code2taxon2count['TOTAL'] = {} transporter_list = ['TOTAL'] else: transporter_list = [] for row in data: row = list(row) if total: if str(row[0]) not in code2taxon2count['TOTAL']: code2taxon2count['TOTAL'][str(row[0])] = int(row[2]) else: code2taxon2count['TOTAL'][str(row[0])] += int(row[2]) if row[1] not in transporter_list: transporter_list.append(row[1]) if row[1] not in code2taxon2count: code2taxon2count[row[1]] = {} code2taxon2count[row[1]][str(row[0])] = int(row[2]) else: code2taxon2count[row[1]][str(row[0])] = int(row[2]) return transporter_list, code2taxon2count
def get_single_set_data(biodb, set_name, column="bitscore", bitscore_cutoff=0, query_coverage_cutoff=0): ''' :param biodb: :param set_name: :param score: should one of those: bitscore, bias, evalue, query_coverage :return: ''' import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) sql = 'select from hmm_sets t1 inner join hmm_sets_entry t2 on t1.set_id=t2.set_id ' \ ' inner join hmm_hits_annotated_genome_chlamydia_04_16 t3 ' \ ' on t2.hmm_id=t3.hmm_id inner join custom_tables.locus2seqfeature_id_chlamydia_04_16 t4 ' \ ' on t3.seqfeature_id=t4.seqfeature_id where t1.name="T3SS"' sql = 'select t3.taxon_id,t4.name,%s from hmm.hmm_sets t1 ' \ ' inner join hmm.hmm_sets_entry t2 on t1.set_id=t2.set_id ' \ ' inner join hmm.hmm_hits_annotated_genome_%s t3 on t2.hmm_id=t3.hmm_id ' \ ' inner join hmm.hmm_profiles t4 on t2.hmm_id=t4.hmm_id ' \ ' inner join custom_tables.locus2seqfeature_id_%s t5 on t3.seqfeature_id=t5.seqfeature_id' \ ' where t1.name="%s" and bitscore>=%s and query_coverage>=%s order by bitscore;' % (column, biodb, biodb, set_name, bitscore_cutoff, query_coverage_cutoff) gene2taxon2score = {} gene_list = [] data = server.adaptor.execute_and_fetchall(sql, ) for row in data: if row[1] not in gene_list: gene_list.append(row[1]) if row[1] not in gene2taxon2score: gene2taxon2score[row[1]] = {} gene2taxon2score[row[1]][str(row[0])] = row[2] else: if str(row[0]) not in gene2taxon2score[row[1]]: gene2taxon2score[row[1]][str(row[0])] = row[2] else: print('already present!!!!!!!') continue return gene2taxon2score, gene_list
def insert_taxon_links_into_mysql_locus(biodb, locus2taxons): import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) for locus_1 in locus2taxons: for locus_2 in locus2taxons[locus_1]: for one_taxon_pair in locus2taxons[locus_1][locus_2]: sql = 'insert into interactions.colocalization_taxons_table_locus_%s (locus_1, locus_2, group_1, ' \ ' group_2, taxon_1, taxon_2)' \ ' values ("%s","%s","%s","%s", %s, %s)' % (biodb, locus_1, locus_2, one_taxon_pair[0], one_taxon_pair[1], one_taxon_pair[2], one_taxon_pair[3]) server.adaptor.execute_and_fetchall(sql,) server.adaptor.commit()
def convert_tree_taxon2genome(biodb_name, input_tree, output_tree, sqlite=False): server, db = manipulate_biosqldb.load_db(biodb_name, sqlite=sqlite) print biodb_name taxon_id2genome_description = manipulate_biosqldb.taxon_id2genome_description( server, biodb_name) print taxon_id2genome_description #locus2genome = manipulate_biosqldb.locus_tag2genome_name(server, biodb_name) import re for i in taxon_id2genome_description.keys(): print i taxon_id2genome_description[i] = re.sub(" subsp\. aureus", "", taxon_id2genome_description[i]) taxon_id2genome_description[i] = re.sub(", complete genome\.", "", taxon_id2genome_description[i]) taxon_id2genome_description[i] = re.sub(", complete sequence\.", "", taxon_id2genome_description[i]) taxon_id2genome_description[i] = re.sub("strain ", "", taxon_id2genome_description[i]) taxon_id2genome_description[i] = re.sub("str\. ", "", taxon_id2genome_description[i]) taxon_id2genome_description[i] = re.sub(" complete genome sequence\.", "", taxon_id2genome_description[i]) taxon_id2genome_description[i] = re.sub(" complete genome\.", "", taxon_id2genome_description[i]) taxon_id2genome_description[i] = re.sub(" chromosome", "", taxon_id2genome_description[i]) taxon_id2genome_description[i] = re.sub("Staphylococcus", "S.", taxon_id2genome_description[i]) taxon_id2genome_description[i] = re.sub(" DNA", "S.", taxon_id2genome_description[i]) #print taxon_id2genome_description[i] print taxon_id2genome_description new_tree = parse_newick_tree.convert_terminal_node_names( input_tree, taxon_id2genome_description) #print new_tree[0] print "writing converted tree..." print output_tree Phylo.write(new_tree, output_tree, 'newick')
def get_multiple_set_counts(biodb, set_list, bitscore_cutoff=0, query_coverage_cutoff=0, hit_coverage_cut0ff=0): ''' :param biodb: :param set_name: :param score: should one of those: bitscore, bias, evalue, query_coverage :return: ''' import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) set_filter = '"' + '","'.join(set_list) + '"' sql = 'select from blast_sets t1 inner join blast_sets_entry t2 on t1.set_id=t2.set_id inner join blast_hits_annotated_genome_chlamydia_04_16 t3 ' \ 'on t2.hmm_id=t3.hmm_id inner join custom_tables.locus2seqfeature_id_chlamydia_04_16 t4 on t3.seqfeature_id=t4.seqfeature_id where t1.name="T3SS"' sql = 'select taxon_id,A.name, count(*) from (select t1.*,t4.* from blast.blast_sets t1 inner join blast.blast_sets_entry t2 ' \ ' on t1.set_id=t2.set_id inner join blast.blast_db t3 on t2.seq_id=t3.seq_id ' \ ' inner join blast.blast_hits_annotated_genome_%s t4 on t3.seq_id=t4.seq_id ' \ ' where t1.name in (%s) and bitscore>=%s and query_coverage>=%s and hit_coverage>=%s ' \ ' group by taxon_id,name,seq_id) A group by taxon_id,A.name;' % (biodb, set_filter, bitscore_cutoff, query_coverage_cutoff, hit_coverage_cut0ff) print(sql) set2taxon2count = {} # taxon_id, set_name, count data = server.adaptor.execute_and_fetchall(sql, ) for row in data: taxon_id, set_name, count = row if set_name not in set2taxon2count: set2taxon2count[set_name] = {} set2taxon2count[set_name][str(taxon_id)] = int(count) else: if str(row[0]) not in set2taxon2count[set_name]: set2taxon2count[set_name][str(taxon_id)] = int(count) else: print('already present!!!!!!!') continue return set2taxon2count
def get_annotation_file(biodb, taxon_id): ''' Pos ID Description 1 G0001 Module A subunit 1 2 G0002 Module A subunit 2 :return: ''' import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) sql = 'select t2.locus_tag,t2.product from comparative_tables.orthology_%s t1 inner join orthology_detail_%s t2 on t1.orthogroup=t2.orthogroup' \ ' where t2.taxon_id=%s' % (biodb, biodb, taxon_id) data = server.adaptor.execute_and_fetchall(sql, ) print 'Pos\tID\tDescription' for i, row in enumerate(data): print "%s\t%s\t%s" % (i + 1, row[0], row[1])
def insert_group_links_into_mysql(biodb, group2links): import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) pair_done = [] for i in group2links: for x in group2links[i]: if [i, x] in pair_done: continue else: pair_done.append([i, x]) pair_done.append([x, i]) sql = 'insert into interactions.colocalization_table_%s (group_1, group_2, n_links, n_comparisons, ratio)' \ ' values ("%s","%s",%s,%s,%s)' % (biodb, i, x, group2links[i][x][0], group2links[i][x][1], group2links[i][x][0]/float(group2links[i][x][1])) server.adaptor.execute_and_fetchall(sql,) server.adaptor.commit()
def transporter_superfamily_heatmap(biodb, family, evalue, bitscore, query_cov, hit_cov): import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) sql = 'select t1.taxon_id,t3.description, count(*) as n, t3.tc_name from transporters.transporters_%s t1 ' \ ' inner join transporters.transporter_table t2 on t1.transporter_id=t2.transporter_id ' \ ' inner join transporters.tc_table t3 on t2.family=t3.tc_id ' \ ' inner join transporters.tc_table t4 on t2.superfamily=t4.tc_id ' \ ' where query_cov>%s and hit_cov>%s and evalue<=%s and bitscore_first_hsp>=%s and t4.description="%s" ' \ ' group by taxon_id,t3.tc_id;' % (biodb, query_cov, hit_cov, evalue, bitscore, family) print(sql) data = server.adaptor.execute_and_fetchall(sql, ) code2taxon2count = {} code2taxon2count['TOTAL'] = {} transporter_list = ['TOTAL'] for row in data: if row[3] not in row[1]: family_label = "%s: %s" % (row[3], row[1]) else: family_label = row[1] row = list(row) if str(row[0]) not in code2taxon2count['TOTAL']: code2taxon2count['TOTAL'][str(row[0])] = int(row[2]) else: code2taxon2count['TOTAL'][str(row[0])] += int(row[2]) if family_label not in transporter_list: transporter_list.append(family_label) if family_label not in code2taxon2count: code2taxon2count[family_label] = {} code2taxon2count[family_label][str(row[0])] = int(row[2]) else: code2taxon2count[family_label][str(row[0])] = int(row[2]) return transporter_list, code2taxon2count
def write_ortho_matrix(bio_db_name): server, db = manipulate_biosqldb.load_db(bio_db_name) matrix = np.array( manipulate_biosqldb.get_orthology_table(server, bio_db_name)) taxon_id2description = manipulate_biosqldb.taxon_id2genome_description( server, bio_db_name) group_names = matrix[:, 0] taxons_ids = manipulate_biosqldb.get_taxon_id_list(server, bio_db_name) import re for i in taxon_id2description.keys(): taxon_id2description[i] = re.sub(" subsp\. aureus", "", taxon_id2description[i]) taxon_id2description[i] = re.sub(", complete genome\.", "", taxon_id2description[i]) taxon_id2description[i] = re.sub(", complete sequence\.", "", taxon_id2description[i]) taxon_id2description[i] = re.sub("strain ", "", taxon_id2description[i]) taxon_id2description[i] = re.sub("str\. ", "", taxon_id2description[i]) taxon_id2description[i] = re.sub(" complete genome sequence\.", "", taxon_id2description[i]) taxon_id2description[i] = re.sub(" complete genome\.", "", taxon_id2description[i]) taxon_id2description[i] = re.sub(" chromosome", "", taxon_id2description[i]) taxon_id2description[i] = re.sub(" DNA", "", taxon_id2description[i]) taxons_ids = [taxon_id2description[str(i)] for i in taxons_ids] #print taxon_id2description f = open("ortho_matrix.tab", "w") f.write("orthogroup\t" + "\t".join(taxons_ids) + "\n") for row in matrix: f.write("\t".join(row) + "\n") f.close()
def get_module_count_all_db(biodb, category=False): ''' :param biodb: <biodatabase name> :param category: KEGG module category (optional) :return: for each module, return the total count from KEGG, and the total count of KO present in the <biodb> ''' import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) sql_biodb_id = 'select biodatabase_id from biodatabase where name="%s"' % biodb database_id = server.adaptor.execute_and_fetchall(sql_biodb_id, )[0][0] if category: sql_pathway_count = 'select BB.module_name,count_all,count_db,count_db/count_all from (select module_id, count(*) ' \ ' as count_db from (select distinct ko_id from enzyme.locus2ko_%s) as t1' \ ' inner join enzyme.module2ko as t2 on t1.ko_id=t2.ko_id group by module_id) AA ' \ ' right join (select t1.module_id,module_name, count_all from (select module_id, count(*) as count_all ' \ 'from enzyme.module2ko group by module_id) t1 inner join enzyme.kegg_module as t2 ' \ 'on t1.module_id=t2.module_id where module_sub_cat="%s")BB on AA.module_id=BB.module_id;' % (biodb, category) # where pathway_category!="1.0 Global and overview maps" else: # select distinct KO # join with module sql_pathway_count = 'select BB.module_name,count_all,count_db,count_db/count_all from (select module_id, count(*) ' \ ' as count_db from (select distinct ko_id from enzyme.locus2ko_%s) as t1' \ ' inner join enzyme.module2ko as t2 on t1.ko_id=t2.ko_id group by module_id) AA ' \ ' right join (select t1.module_id,module_name, count_all from (select module_id, count(*) as count_all ' \ 'from enzyme.module2ko group by module_id) t1 inner join enzyme.kegg_module as t2 ' \ 'on t1.module_id=t2.module_id)BB on AA.module_id=BB.module_id;' % (biodb) # where pathway_category!="1.0 Global and overview maps" map2count = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql_pathway_count, )) return map2count