def parse_orthofinder_blast_files(biodb, orthofinder_blast_file_list, locus_tag2orthofinder_id, evalue_cutoff=0.01, identity_cutoff=0, sqlite3=False):
    """Parse OrthoFinder BLAST tabular files and record, for each query locus,
    the first hit encountered against each target taxon (BLAST output is
    sorted, so the first hit per taxon is the best one).

    Returns {query_locus_tag: {target_taxon_id: [hit_locus_tag, identity,
    evalue, bitscore, query_align_length, hit_align_length]}}.

    NOTE(review): evalue_cutoff and identity_cutoff are accepted but never
    applied -- presumably filtering was intended; confirm before relying on them.
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    if not sqlite3:
        server, db = manipulate_biosqldb.load_db(biodb)
    else:
        server, db = manipulate_biosqldb.load_db(biodb, sqlite3)
    sql = 'select locus_tag, taxon_id from annotation.seqfeature_id2locus_%s' % biodb
    locus_tag2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))
    locus2taxon2best_hit_id = {}
    total = len(orthofinder_blast_file_list)
    for n, blast_file in enumerate(orthofinder_blast_file_list):
        print(n, total)
        with open(blast_file, 'r') as fh:
            for line in fh:
                # columns: qseqid sseqid pident length mismatch gapopen
                #          qstart qend sstart send evalue bitscore
                cols = line.split('\t')
                locus_tag_1 = locus_tag2orthofinder_id[cols[0]]
                locus_tag_2 = locus_tag2orthofinder_id[cols[1]]
                hit_data = [locus_tag_2,
                            cols[2],                      # identity
                            cols[10],                     # evalue
                            cols[11],                     # bitscore
                            int(cols[7]) - int(cols[6]),  # query align length
                            int(cols[9]) - int(cols[8])]  # hit align length
                taxon_2 = locus_tag2taxon_id[locus_tag_2]
                # only the first hit per (query locus, target taxon) is kept
                locus2taxon2best_hit_id.setdefault(locus_tag_1, {}).setdefault(taxon_2, hit_data)
    return locus2taxon2best_hit_id
def create_comparative_tables_accession(db_name, table_name):
    '''
    create a presence/absence matrix based on accession (and not taxon_ids)
    '''
    # layout: an id column plus one INT column per accession
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(db_name)
    accession_list = get_all_accessions(db_name)
    sql = "CREATE TABLE comparative_tables.%s_accessions_%s(id VARCHAR(100) NOT NULL" % (
        table_name, db_name)
    sql += "".join(" ,%s INT" % accession for accession in accession_list)
    sql += ")"
    server.adaptor.execute(sql)
    # index the id column; the index gets a random name to avoid collisions
    sql_index = 'CREATE index %s on comparative_tables.%s_accessions_%s(id)' % (
        randomString(5), table_name, db_name)
    server.adaptor.execute(sql_index)
def load_hmm_data(biodb, database_name, table_name, hmm_tab_files):
    """Load HMM tabular output files into <database_name>.<table_name>
    (locus_tag, taxon_id, hmm_accession), one row per hit.

    BUG FIX: Python 2 `print` statements were syntax errors in this otherwise
    Python 3 module (f-strings are used elsewhere in the file).
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)
    sql = 'create table %s.%s (locus_tag varchar(400), taxon_id INT, hmm_accession varchar(400));' % (
        database_name, table_name)
    server.adaptor.execute_and_fetchall(sql, )
    sql = 'select locus_tag, taxon_id from biosqldb.orthology_detail_%s' % biodb
    locus2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))
    for hmm_file in hmm_tab_files:
        print(hmm_file)
        with open(hmm_file, 'r') as f:
            for row in f:
                # hmmer tabular output uses '#' for comment/header lines
                if row[0] == '#':
                    continue
                data = row.rstrip().split()
                locus_tag = data[0]
                hit_accession = data[3]
                print('%s\t%s\t%s' % (hit_accession, locus_tag, locus2taxon_id[locus_tag]))
                sql = 'insert into %s.%s values ("%s", %s, "%s")' % (
                    database_name, table_name, locus_tag, locus2taxon_id[locus_tag], hit_accession)
                server.adaptor.execute(sql, )
    server.adaptor.commit()
def blast_sets_form(database_name):
    """Build a Django form class for selecting blast sets and score/coverage
    cutoffs; choices are read from blast.blast_sets plus an "all" entry.

    BUG FIX: the hit_coverage_cutoff field was labelled "Query coverage
    cutoff" (copy-paste from the field above); it now reads "Hit coverage
    cutoff".
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(database_name)
    sql = 'select distinct name from blast.blast_sets;'  # % database_name
    categories = server.adaptor.execute_and_fetchall(sql, )
    CHOICES = [(i[0], i[0]) for i in categories]
    CHOICES.append(("all", "all"))

    class BlastSetChoice(forms.Form):
        blast_set = forms.MultipleChoiceField(
            choices=CHOICES,
            widget=forms.SelectMultiple(attrs={'size': '20'}),
            required=False)
        score_cutoff = forms.CharField(max_length=3,
                                       label="Bitscore cutoff",
                                       initial=10,
                                       required=False)
        query_coverage_cutoff = forms.CharField(max_length=3,
                                                label="Query coverage cutoff",
                                                initial=0.5,
                                                required=False)
        hit_coverage_cutoff = forms.CharField(max_length=3,
                                              label="Hit coverage cutoff",
                                              initial=0.5,
                                              required=False)

    return BlastSetChoice
def locus2inference_table(biodb):
    """Create locus_tag2uniprot_hit_<biodb> and fill it with the UniProtKB
    accession parsed from each locus' seqfeature qualifier values.

    Best effort: loci without a UniProtKB qualifier are silently skipped.

    BUG FIX: Python 2 `print` statement; bare `except:` clauses narrowed to
    `except Exception` so KeyboardInterrupt/SystemExit are not swallowed.
    """
    server, db = manipulate_biosqldb.load_db(biodb)
    sql = 'CREATE TABLE locus_tag2uniprot_hit_%s (locus_tag varchar(400),' \
          ' uniprot_id varchar(400), index locus_tag(locus_tag))' % biodb
    server.adaptor.execute(sql, )
    locus2seqfeature_id = manipulate_biosqldb.locus_tag2seqfeature_id_dict(
        server, biodb)
    for locus in locus2seqfeature_id:
        # %%%% survives the outer %-format as %% which the driver reduces to
        # the LIKE wildcard %
        sql = 'select value from seqfeature_qualifier_value where seqfeature_id=%s and value like "%%%%UniProtKB%%%%"' % (
            locus2seqfeature_id[locus])
        try:
            data = server.adaptor.execute_and_fetchall(sql, )[0][0]
            # qualifier looks like "...:UniProtKB:<accession>" -- keep field 2
            sql2 = 'insert into locus_tag2uniprot_hit_%s values ("%s", "%s")' % (
                biodb, locus, data.split(':')[2])
            try:
                server.adaptor.execute(sql2, )
                server.commit()
            except Exception:
                print(sql2)
        except Exception:
            # no UniProtKB qualifier (or malformed value) for this locus
            pass
def locus2ec_table(hash2ec_dico, biodatabase, hash2locus_list):
    """Populate enzyme.seqfeature_id2ec_<biodatabase>, linking every
    seqfeature to the enzyme ids of its EC numbers."""
    from chlamdb.biosqldb import manipulate_biosqldb
    from chlamdb.biosqldb import biosql_own_sql_tables
    server, db = manipulate_biosqldb.load_db(biodatabase)
    sql2 = 'CREATE TABLE IF NOT EXISTS enzyme.seqfeature_id2ec_%s (enzyme_id INT AUTO_INCREMENT PRIMARY KEY,' \
           ' seqfeature_id INT,' \
           ' ec_id INT,' \
           ' FOREIGN KEY(ec_id) REFERENCES enzymes(enzyme_id));' % (biodatabase)
    server.adaptor.execute_and_fetchall(sql2, )
    sql = 'select locus_tag, seqfeature_id from annotation.seqfeature_id2locus_%s' % biodatabase
    locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))
    for protein_hash, ec_entries in hash2ec_dico.items():
        for ec_entry in ec_entries:
            # resolve the numerical enzyme id for this EC number once
            lookup = 'select enzyme_id from enzyme.enzymes where ec="%s"' % ec_entry[0]
            ec_id = server.adaptor.execute_and_fetchall(lookup, )[0][0]
            for locus in hash2locus_list[protein_hash]:
                insert = 'insert into enzyme.seqfeature_id2ec_%s (seqfeature_id, ec_id) values (%s, %s)' % (
                    biodatabase, locus_tag2seqfeature_id[locus], ec_id)
                server.adaptor.execute(insert, )
    server.commit()
def make_kegg_form(database_name):
    """Build a Django form class with multiple-choice fields listing the KEGG
    pathways and modules annotated in this database."""
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(database_name)
    sql_pathways = 'select description,description from enzyme.locus2ko_%s t1 ' \
                   ' inner join enzyme.pathway2ko t2 ' \
                   ' on t1.ko_id = t2.ko_id ' \
                   ' inner join enzyme.kegg_pathway t3 ' \
                   ' on t3.pathway_id=t2.pathway_id group by description;' % (database_name)
    pathway_choices = server.adaptor.execute_and_fetchall(sql_pathways, )
    sql_modules = 'select description,description from enzyme.locus2ko_%s t1 ' \
                  ' inner join enzyme.module2ko t2 on t1.ko_id = t2.ko_id ' \
                  ' inner join enzyme.kegg_module t3 on t3.module_id=t2.module_id group by description;' % database_name
    module_choices = server.adaptor.execute_and_fetchall(sql_modules, )

    def _multi_choice(choices, label):
        # 17-row multi-select, always optional
        return forms.MultipleChoiceField(
            label=label,
            choices=choices,
            widget=forms.SelectMultiple(attrs={'size': '%s' % "17"}),
            required=False)

    class KeggForm(forms.Form):
        pathway_choice = _multi_choice(pathway_choices, '')
        module_choice = _multi_choice(module_choices, "")

    return KeggForm
def find_links_recusrsive(biodb, all_connected_seqfeatures, ratio_cutoff=0.5, n_comp_cutoff=1):
    """Transitively expand a set of loci through the colocalization table.

    Recurses until no new locus is linked above the cutoffs; returns the
    closed list of loci.

    Cleanup: the identical query was duplicated in both branches -- only the
    IN-list formatting (quoted locus tags vs numeric ids) actually differs.
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)
    if biodb == 'chlamydia_04_16':
        # this legacy database keys the table on locus tags -> quote them
        filter = '"' + '","'.join(all_connected_seqfeatures) + '"'
    else:
        all_connected_seqfeatures = [str(i) for i in all_connected_seqfeatures]
        filter = ','.join(all_connected_seqfeatures)
    sql = 'select locus_1, locus_2 from interactions.colocalization_table_locus_%s where (locus_1 in (%s) or locus_2 in (%s)) and ' \
          ' (ratio >= %s and n_comparisons >= %s)' % (biodb, filter, filter, ratio_cutoff, n_comp_cutoff)
    data = server.adaptor.execute_and_fetchall(sql,)
    all_groups = []
    for locus_1, locus_2 in data:
        if locus_1 not in all_groups:
            all_groups.append(locus_1)
        if locus_2 not in all_groups:
            all_groups.append(locus_2)
    # a bigger set means new links were found: recurse on the expanded set
    if len(all_groups) > len(all_connected_seqfeatures):
        return find_links_recusrsive(biodb, all_groups, ratio_cutoff, n_comp_cutoff)
    return all_groups
def add_average_orthogroup_identity(self, biodatabase_name):
    # Compute the mean pairwise protein identity of every orthogroup and
    # store it in orth_<biodatabase_name>.average_identity.
    server, db = manipulate_biosqldb.load_db(biodatabase_name)
    sql = 'select orthogroup from comparative_tables.orthology_%s' % biodatabase_name
    try:
        # add the target column; fails harmlessly if it already exists
        self._create_orthogroup_average_identity_column(server, biodatabase_name)
    except:
        print ("column already created?")
    groups = [i[0] for i in server.adaptor.execute_and_fetchall(sql, )]
    for group in groups:
        try:
            # identity table: column 0 is an identifier, the rest a square
            # identity matrix -- average its strict upper triangle
            id_table = np.array(get_orthogroup_identity_table(biodatabase_name, group))
            id_matrix = id_table[:,1:].astype(float)
            av_id = round(np.mean(id_matrix[np.triu_indices(len(id_matrix), k=1)]), 2)
        except:
            # singleton groups (or malformed tables) get identity 0
            av_id = 0
        sql = 'insert into orth_%s.average_identity values ("%s", %s)' % (biodatabase_name, group, av_id)
        server.adaptor.execute(sql)
    # NOTE(review): source formatting was mangled; commit reconstructed as
    # running once after the loop -- confirm against version history
    server.commit()
def insert_new_tc_path(biodb, tc_name):
    """Register a 5-level TC (Transporter Classification) id, inserting every
    hierarchy level via add_one_tc_id, and return the numeric id of the
    complete classification (or the existing id if already present).

    BUG FIX: Python 2 `print` statement; bare `except:` narrowed to
    IndexError (the "not yet present" case).
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)
    separated_ids = tc_name.split('.')
    if len(separated_ids) != 5:
        print('invalid id!')
        import sys
        sys.exit()
    id1 = separated_ids[0]
    id_superfamily = '.'.join(separated_ids[0:2])
    id_family = '.'.join(separated_ids[0:3])
    id_subfamily = '.'.join(separated_ids[0:4])
    id_complete = tc_name
    # the textual ids are deliberately re-bound to their numeric ids
    id_id1 = add_one_tc_id(biodb, id1)
    id_superfamily = add_one_tc_id(biodb, id_superfamily)
    id_family = add_one_tc_id(biodb, id_family)
    id_subfamily = add_one_tc_id(biodb, id_subfamily)
    id_complete = add_one_tc_id(biodb, id_complete)
    sql = 'select transporter_id from transporters.transporter_table where transporter_id=%s' % id_complete
    try:
        id = server.adaptor.execute_and_fetchall(sql, )[0][0]
        return id
    except IndexError:
        # not present yet: insert the full hierarchy row
        sql = 'insert into transporters.transporter_table (transporter_id, tc1,superfamily, family, subfamily) values ("%s",' \
              ' "%s","%s","%s","%s")' % (id_complete, id_id1, id_superfamily, id_family, id_subfamily)
        server.adaptor.execute(sql, )
        server.commit()
        return id_complete
def locus2ko_table(locus_tag2ko_dico, biodatabase, ko_accession2ko_id):
    """Fill enzyme.seqfeature_id2ko_<biodatabase> from a
    locus_tag -> KO accession mapping."""
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodatabase)
    sql2 = 'select locus_tag, seqfeature_id from annotation.seqfeature_id2locus_%s' % biodatabase
    locus2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql2))
    sql2 = 'CREATE TABLE IF NOT EXISTS enzyme.seqfeature_id2ko_%s (seqfeature_id INT,' \
           ' ko_id INT, ' \
           ' index ko_id (ko_id),' \
           ' index seqid (seqfeature_id));' % (biodatabase)
    server.adaptor.execute_and_fetchall(sql2, )
    for locus_tag, ko_accession in locus_tag2ko_dico.items():
        insert = 'insert into enzyme.seqfeature_id2ko_%s (seqfeature_id, ko_id) values (%s, %s)' % (
            biodatabase, locus2seqfeature_id[locus_tag], ko_accession2ko_id[ko_accession])
        server.adaptor.execute(insert, )
    server.commit()
def import_phylo(phylo_list, biodb):
    """Store BBH phylogenies (newick strings) in
    biosqldb_phylogenies.BBH_<biodb>, resolving each tree's orthogroup from
    the first leaf whose name is a known locus tag.

    BUG FIX: if no leaf resolved to an orthogroup, `orthogroup` was either
    unbound (NameError on the first tree) or silently reused from the
    previous iteration; such trees are now reported and skipped.
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    from chlamdb.biosqldb import biosql_own_sql_tables
    import ete3
    import re
    server, db = manipulate_biosqldb.load_db(biodb)
    sql = 'create table IF NOT EXISTS biosqldb_phylogenies.BBH_%s (orthogroup varchar(100), phylogeny text, INDEX orthogroup (orthogroup));' % biodb
    server.adaptor.execute(sql, )
    locuslag2orthogroup = biosql_own_sql_tables.locus_tag2orthogroup(biodb)
    l = len(phylo_list)
    for n, phylo in enumerate(phylo_list):
        print("%s/%s" % (n, l))
        t = ete3.Tree(phylo, format=0)
        orthogroup = None
        for leaf in t.iter_leaves():
            if leaf.name in locuslag2orthogroup:
                orthogroup = locuslag2orthogroup[leaf.name]
                break
        if orthogroup is None:
            print("no orthogroup found for any leaf of tree %s, skipping" % n)
            continue
        sql = 'insert into biosqldb_phylogenies.BBH_%s values ("%s", "%s");' % (
            biodb, orthogroup, t.write())
        try:
            server.adaptor.execute(sql, )
        except Exception:
            # keep loading the remaining trees; report the offending newick
            print(phylo)
    server.commit()
def get_mysql_table(db_name, table_name):
    """Export comparative_tables.<table_name>_<db_name> to a TSV file named
    <table_name>_matrix.tab, using genome descriptions as column headers.

    BUG FIX: the output file handle was never closed (resource leak); the
    column list also crashed on an empty taxon list via all_taxons_id[-1].
    """
    import numpy as np
    server, db = manipulate_biosqldb.load_db(db_name)
    all_taxons_id = manipulate_biosqldb.get_taxon_id_list(server, db_name)
    # backtick-quote taxon-id column names (they are numeric)
    sql_taxons = "id, " + ", ".join('`%s`' % taxon for taxon in all_taxons_id)
    sql = "select %s from comparative_tables.%s_%s" % (sql_taxons, table_name, db_name)
    mat = np.array(server.adaptor.execute_and_fetchall(sql, ))
    taxonid2genome = manipulate_biosqldb.taxon_id2genome_description(
        server, db_name, True)
    taxons_ids = [taxonid2genome[int(i)] for i in all_taxons_id]
    with open("%s_matrix.tab" % table_name, "w") as f:
        f.write('"id"\t"' + '"\t"'.join(taxons_ids) + '"\n')
        for row in mat:
            f.write("\t".join(str(i) for i in row) + "\n")
def pathway_list2profile_dico(biodb, pathway_list, taxon_id_list=[], group_by_KO=True):
    """Count, per taxon, annotated entries for each KEGG pathway in
    pathway_list, optionally restricted to taxon_id_list.

    Returns (pathway_description_list, {description: {taxon_id(str): count}}).

    BUG FIX: the group-by-KO query ended in a double semicolon (`;;`); a
    duplicated if/else that assigned the same value in both branches was
    also merged. NOTE(review): the mutable default taxon_id_list is only
    read, never mutated, so it is harmless here.
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)
    sql = 'select biodatabase_id from biodatabase where name="%s"' % biodb
    db_id = server.adaptor.execute_and_fetchall(sql, )[0][0]
    filter_2 = '"' + '","'.join(pathway_list) + '"'
    # restrict to a subset of the available taxa if requested
    if len(taxon_id_list) > 0:
        filter = ','.join(taxon_id_list)
        sql = 'select taxon_id, description, count(*) from (select distinct taxon_id,description,t3.pathway_id,t1.ko_id ' \
              ' from enzyme.locus2ko_%s t1 inner join enzyme.pathway2ko t2 on t1.ko_id=t2.ko_id ' \
              ' inner join enzyme.kegg_pathway as t3 on t2.pathway_id=t3.pathway_id ' \
              ' where t3.pathway_name in (%s) and taxon_id in (%s)) A group by taxon_id,pathway_id;' % (biodb, filter_2, filter)
        print(sql)
    else:
        if not group_by_KO:
            print('not grouping')
            sql = 'select taxon_id,description,count(*) from (select distinct taxon_id,description,t3.pathway_id,t1.ko_id ' \
                  ' from enzyme.locus2ko_%s t1 inner join enzyme.pathway2ko t2 on t1.ko_id=t2.ko_id ' \
                  ' inner join enzyme.kegg_pathway as t3 on t2.pathway_id=t3.pathway_id ' \
                  ' where t3.pathway_name in (%s)) A group by taxon_id,pathway_id;' % (biodb, filter_2)
        else:
            print('grouping')
            sql = 'select taxon_id, description, count(*) from (select distinct taxon_id,description,t4.ko_accession ' \
                  ' from enzyme.seqfeature_id2ko_%s t1 ' \
                  ' inner join annotation.seqfeature_id2locus_%s tb on t1.seqfeature_id=tb.seqfeature_id ' \
                  ' inner join enzyme.pathway2ko t2 on t1.ko_id=t2.ko_id inner join enzyme.kegg_pathway as t3 on t2.pathway_id=t3.pathway_id' \
                  ' inner join enzyme.ko_annotation t4 on t1.ko_id=t4.ko_id where pathway_name in (%s) ) A ' \
                  ' group by A.taxon_id, A.description;' % (biodb, biodb, filter_2)
        print(sql)
    data = server.adaptor.execute_and_fetchall(sql, )
    code2taxon2count = {}
    pathway_descriptions = []
    for row in data:
        description = row[1]
        if description not in pathway_descriptions:
            pathway_descriptions.append(description)
        code2taxon2count.setdefault(description, {})[str(row[0])] = int(row[2])
    return pathway_descriptions, code2taxon2count
def vcf2heatmap(vcf_file, biodb="saureus_01_15", output="heat_ksnp_SaC.pdf"):
    """Summarise a kSNP VCF matrix: count core vs non-core SNP positions
    (core = present in more than 50 samples) and return the core positions.

    BUG FIX: Python 2 `print` statements; the bare `except:` around the
    sample renaming narrowed to KeyError (missing description).
    """
    my_array, sample_names = parse_vcf(vcf_file)
    server, db = manipulate_biosqldb.load_db(biodb)
    accession2description = manipulate_biosqldb.accession2description(server, biodb)
    # best effort: keep the raw accession when no description is available
    for i in range(0, len(sample_names)):
        try:
            sample_names[i] = accession2description[sample_names[i]]
        except KeyError:
            pass
    M = my_array[:, 0:]
    # column 0 holds the SNP location; remaining columns are presence flags
    core = 0
    core_locations = []
    non_core = 0
    for index, value in enumerate(M[:, 1:].sum(axis=1)):
        if value > 50:
            core += 1
            core_locations.append(int(M[index, 0]))
        else:
            non_core += 1
    print("core", core, len(core_locations))
    print("non core", non_core)
    print(core / float((core + non_core)))
    #M = heatmap.randomize_table(M[:,1:])
    #heatmap.heatmap_ksnp(M, format='png', output=output, breaks="-0.5, 0.5, 5, 99", rows=None, columns = sample_names,orderRows = False)
    return core_locations
def load_sport_data(psortdb_output_files, db_name, hash2locus_list):
    """Parse PSORTb output files and load the predicted localisation (and its
    score) into custom_tables.seqfeature_id2psortb_<db_name>, one row per
    locus sharing the protein hash."""
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(db_name)
    hash2psort_results = {}
    for result_file in psortdb_output_files:
        hash2psort_results.update(parse_psort_results(result_file))
    sql = f'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_{db_name}'
    locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql,))
    sql = f'create table if not exists custom_tables.seqfeature_id2psortb_{db_name} (seqfeature_id INT, final_prediction varchar(200), score FLOAT)'
    server.adaptor.execute(sql,)
    sql_template = f'insert into custom_tables.seqfeature_id2psortb_{db_name} values (%s, %s, %s)'
    for protein_hash, psort_result in hash2psort_results.items():
        prediction = psort_result['final_prediction']
        # 'Unknown' predictions carry no per-compartment score
        score = None if prediction == 'Unknown' else psort_result[prediction]
        for locus in hash2locus_list[protein_hash]:
            server.adaptor.execute(sql_template,
                                   (locus_tag2seqfeature_id[locus], prediction, score))
    sql = f'create index sfp on custom_tables.seqfeature_id2psortb_{db_name}(seqfeature_id)'
    server.adaptor.execute(sql,)
    server.commit()
def del_blastnr_table_content(db_name):
    """Drop every per-accession blastnr table belonging to this biodatabase."""
    server, db = manipulate_biosqldb.load_db(db_name)
    sql = 'select accession from bioentry' \
          ' inner join biodatabase on bioentry.biodatabase_id=biodatabase.biodatabase_id where biodatabase.name="%s"' % db_name
    all_accessions = [i[0] for i in server.adaptor.execute_and_fetchall(sql, )]
    # drop order preserved from the original: hsps, taxonomy, filtered
    # taxonomy, then hits
    table_prefixes = ('blastnr_hsps',
                      'blastnr_hits_taxonomy',
                      'blastnr_hits_taxonomy_filtered',
                      'blastnr_hits')
    for accession in all_accessions:
        for prefix in table_prefixes:
            drop = 'DROP TABLE IF EXISTS blastnr.%s_%s_%s' % (prefix, db_name, accession)
            print(drop)
            server.adaptor.execute(drop)
            server.adaptor.commit()
def taxon2module2count(biodb, category=False):
    """Count distinct KOs per KEGG module for every taxon, optionally
    restricted to a module sub-category.

    Returns {taxon_id: {module_name: ko_count}}.
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)
    if category:
        sql = 'select B.module_sub_cat,A.taxon_id,B.module_name,A.n,B.description from ' \
              ' (select taxon_id, module_id, count(*) as n from ' \
              ' (select distinct taxon_id,ko_id from enzyme.locus2ko_%s) t1 ' \
              ' inner join enzyme.module2ko as t2 on t1.ko_id=t2.ko_id group by taxon_id, module_id) A ' \
              ' inner join enzyme.kegg_module as B on A.module_id=B.module_id where module_sub_cat="%s";' % (biodb, category)
    else:
        sql = 'select B.module_sub_cat,A.taxon_id,B.module_name,A.n,B.description from ' \
              ' (select taxon_id, module_id, count(*) as n from ' \
              ' (select distinct taxon_id,ko_id from enzyme.locus2ko_%s) t1 ' \
              ' inner join enzyme.module2ko as t2 on t1.ko_id=t2.ko_id group by taxon_id, module_id) A ' \
              ' inner join enzyme.kegg_module as B on A.module_id=B.module_id;' % (biodb)
    pathway_data = server.adaptor.execute_and_fetchall(sql, )
    # row layout: (module_sub_cat, taxon_id, module_name, count, description)
    taxon2module2c = {}
    for row in pathway_data:
        taxon2module2c.setdefault(row[1], {})[row[2]] = row[3]
    return taxon2module2c
def locus2ko_table_legacy(biodatabase, hash2ko, hash2locus_list):
    '''
    Legacy table enzyme.locus2ko_<db>: taxon_id, locus_tag, orthogroup,
    ko_id (the KO accession, not the numerical id).
    TODO: remove all dependencies to this table.
    '''
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodatabase)
    sql = 'select locus_tag, taxon_id from orthology_detail_%s' % biodatabase
    sql2 = 'select locus_tag, orthogroup from orthology_detail_%s' % biodatabase
    locus2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql))
    locus2orthogroup = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql2))
    sql2 = 'CREATE TABLE IF NOT EXISTS enzyme.locus2ko_%s (taxon_id INT,'\
           ' locus_tag VARCHAR(200),' \
           ' orthogroup varchar(200),' \
           ' ko_id VARCHAR(200), index taxon_id (taxon_id), index ko_id (ko_id));' % (biodatabase)
    server.adaptor.execute_and_fetchall(sql2, )
    for protein_hash, ko_data in hash2ko.items():
        ko_accession = ko_data["KO"]
        for locus in hash2locus_list[protein_hash]:
            insert = 'insert into enzyme.locus2ko_%s (taxon_id, locus_tag, orthogroup, ko_id) values ("%s", "%s", "%s", "%s")' % (
                biodatabase, locus2taxon_id[locus], locus, locus2orthogroup[locus], ko_accession)
            server.adaptor.execute(insert, )
    server.commit()
def load_reference_phylogeny(db_name, newick_file, gbk_files):
    """Load a reference species tree (single-line newick) into the
    reference_phylogeny table, renaming terminal nodes from gbk file names
    to taxon ids.

    Raises IOError if the file does not contain exactly one newick line
    (BUG FIX: an empty file previously raised an opaque IndexError).
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    from chlamdb.plots import parse_newick_tree
    file_name2taxid = gbk2taxid(gbk_files, db_name)
    print(file_name2taxid)
    server, db = manipulate_biosqldb.load_db(db_name)
    sql = 'select biodatabase_id from biodatabase where name="%s"' % (db_name)
    db_id = server.adaptor.execute_and_fetchall(sql, )[0][0]
    sql = 'create table if not exists reference_phylogeny (biodatabase_id INT, tree TEXT)'
    server.adaptor.execute(sql, )
    with open(newick_file, 'r') as f:
        rows = [i for i in f]
    if len(rows) != 1:
        raise IOError("wrong tree format (need newick)")
    newick_string = rows[0].rstrip()
    # rename terminal nodes from file names to taxon ids
    new_tree = parse_newick_tree.convert_terminal_node_names(
        newick_string, file_name2taxid, False)
    sql = 'insert into reference_phylogeny values( %s, "%s")' % (
        db_id, new_tree.write())
    server.adaptor.execute(sql, )
    server.commit()
def import_eggnog(eggnog_file, biodb):
    """Load eggNOG-mapper best-hit rows (tab-separated, one header line,
    '-' meaning no hit) into COG.eggnog_hits_<biodb>.

    BUG FIX: Python 2 `print` statements; also dropped the unused
    function-level imports (ete2, re, biosql_own_sql_tables).
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)
    sql = 'create table IF NOT EXISTS COG.eggnog_hits_%s (query_seq varchar(100), ' \
          ' best_hit_eggNOG_ortholog varchar(400),' \
          ' best_hit_evalue FLOAT,' \
          ' best_hit_score FLOAT,' \
          ' predicted_name varchar(100),' \
          ' index query_seq(query_seq));' % biodb
    print(sql)
    server.adaptor.execute(sql,)
    with open(eggnog_file, 'r') as f:
        for n, one_hit in enumerate(f):
            if n == 0:
                continue  # header line
            data = one_hit.rstrip().split('\t')
            if data[1] == '-':
                continue  # no eggNOG hit for this query
            sql = 'insert into COG.eggnog_hits_%s values ("%s", "%s", %s, %s, "%s");' % (
                biodb, data[0], data[1], data[2], data[3], data[4])
            print(sql)
            server.adaptor.execute(sql,)
    server.commit()
def get_all_orthogroup_protein_fasta(server, biodatabase_name, out_dir):
    """Write one protein FASTA per orthogroup into out_dir, fanning the work
    out over 8 worker processes.

    NOTE(review): the `server` argument is immediately shadowed by a fresh
    connection; kept for interface compatibility.

    BUG FIX: Python 2 `print` statement.
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodatabase_name)
    sql = 'select orthogroup from orthology_detail_%s group by orthogroup' % biodatabase_name
    all_groups = [i[0] for i in server.adaptor.execute_and_fetchall(sql, )]
    n_cpu = 8
    # split the group list into one chunk per worker
    n_poc_per_list = int(numpy.ceil(len(all_groups) / float(n_cpu)))
    query_lists = _chunks(all_groups, n_poc_per_list)
    procs = []
    for one_list in query_lists:
        proc = Process(target=get_one_group_data,
                       args=(one_list, biodatabase_name, out_dir))
        procs.append(proc)
        proc.start()
    print("join proc")
    time.sleep(5)
    for proc in procs:
        proc.join()
def import_phylo(phylo_list, biodb):
    """Store orthogroup phylogenies (newick strings) in
    biosqldb_phylogenies.<biodb>, creating the database and table on demand.
    The orthogroup is looked up from the first leaf of each tree."""
    from chlamdb.biosqldb import manipulate_biosqldb
    from chlamdb.biosqldb import biosql_own_sql_tables
    import ete3
    import re
    server, db = manipulate_biosqldb.load_db(biodb)
    server.adaptor.execute('create database if not exists biosqldb_phylogenies', )
    create_table = 'create table IF NOT EXISTS biosqldb_phylogenies.%s (orthogroup varchar(100), phylogeny longtext);' % biodb
    server.adaptor.execute(create_table, )
    locus2group = biosql_own_sql_tables.locus_tag2orthogroup(biodb)
    total = len(phylo_list)
    for idx, newick in enumerate(phylo_list):
        print("%s/%s" % (idx, total))
        tree = ete3.Tree(newick)
        # all leaves belong to one orthogroup: the first leaf suffices
        first_leaf = next(tree.iter_leaves())
        insert = 'insert into biosqldb_phylogenies.%s values ("%s", "%s");' % (
            biodb, locus2group[first_leaf.name], tree.write())
        server.adaptor.execute(insert, )
    server.commit()
def load_cog_names_table(table_file):
    '''
    create a table with distinct entries for cogs belonging to multiple
    categories (i.e NZ_CP007053 | COG2197 | TK)

    :param table_file: tab-separated file (COG id, category letters, name);
        lines starting with # are skipped
    :return: None

    BUG FIX: Python 2 `print` statement.
    '''
    import re
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db('chlamydia_04_16')
    sql = 'CREATE table COG.cog_names_2014 (COG_id varchar(100),' \
          'functon varchar(10),' \
          'name varchar(200), index COG_id(COG_id))'
    h = re.compile('^#.*')
    server.adaptor.execute_and_fetchall(sql, )
    with open(table_file, 'r') as f:
        for line in f:
            if re.match(h, line):
                continue
            data = line.rstrip().split('\t')
            cog = data[0]
            # one row per single-letter functional category
            all_funct = list(data[1])
            name = re.sub('"', '', data[2])
            for one_category in all_funct:
                sql = 'insert into COG.cog_names_2014 (COG_id, functon, name) values ("%s", "%s", "%s")' % (
                    cog, one_category, name)
                print(sql)
                server.adaptor.execute(sql, )
    server.commit()
def module_list2profile_dico(biodb, module_list, taxon_id_list=[]):
    """Per-taxon distinct-KO counts for each KEGG module in module_list,
    optionally restricted to taxon_id_list.

    Returns (module_description_list, {description: {taxon_id(str): count}}).
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)
    sql = 'select biodatabase_id from biodatabase where name="%s"' % biodb
    db_id = server.adaptor.execute_and_fetchall(sql, )[0][0]
    filter_2 = '"' + '","'.join(module_list) + '"'
    # restrict to a subset of the available taxa if requested
    if len(taxon_id_list) > 0:
        filter = ','.join(taxon_id_list)
        sql = 'select taxon_id, description, count(*) from (select distinct taxon_id,t1.ko_id,module_name,description ' \
              ' from enzyme.seqfeature_id2ko_%s t1' \
              ' inner join enzyme.module2ko t2 on t1.ko_id=t2.ko_id ' \
              ' inner join enzyme.kegg_module as t3 on t2.module_id=t3.module_id ' \
              ' inner join annotation.seqfeature_id2locus_%s t4 ' \
              ' on t1.seqfeature_id=t4.seqfeature_id' \
              ' where module_name in (%s) and taxon_id in (%s)) A group by A.taxon_id,A.module_name;' % (biodb, biodb, filter_2, filter)
    else:
        sql = 'select taxon_id, description, count(*) from (select distinct taxon_id,t1.ko_id,module_name,description ' \
              ' from enzyme.seqfeature_id2ko_%s t1' \
              ' inner join enzyme.module2ko t2 on t1.ko_id=t2.ko_id ' \
              ' inner join enzyme.kegg_module as t3 on t2.module_id=t3.module_id ' \
              ' inner join annotation.seqfeature_id2locus_%s t4 ' \
              ' on t1.seqfeature_id=t4.seqfeature_id' \
              ' where module_name in (%s)) A group by A.taxon_id,A.module_name;' % (biodb, biodb, filter_2)
    print(sql)
    data = server.adaptor.execute_and_fetchall(sql, )
    code2taxon2count = {}
    descriptions = []
    for taxon_id, description, count in data:
        if description not in descriptions:
            descriptions.append(description)
        code2taxon2count.setdefault(description, {})[str(taxon_id)] = int(count)
    return descriptions, code2taxon2count
def create_taxon_link_table(biodb):
    """Create the per-database colocalization taxon-link table."""
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)
    create = 'create table interactions.colocalization_taxons_table_%s (group_1 varchar(200), group_2 varchar(200), taxon_1 INT,' \
             'taxon_2 INT)' % biodb
    server.adaptor.execute(create, )
def create_group_link_table(biodb):
    """Create the per-database colocalization group-link table."""
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(biodb)
    create = 'create table interactions.colocalization_table_%s (group_1 varchar(200), group_2 varchar(200), n_links INT,' \
             'n_comparisons INT, ratio float)' % biodb
    server.adaptor.execute(create, )
def load_PMID(PMID_db_path, hash2locus_list, db_name):
    """Transfer PMID annotations from a local sqlite database into
    string.seqfeature_id2pmid_<db_name> and string.pmid2data, then index.

    BUG FIX: the duplicate-detection list held *strings* (str(pmid)) while
    the pmids read from sqlite were compared and appended unconverted, so
    the membership test could never match and duplicate pmid2data rows were
    inserted. Every comparison now goes through str(pmid), and the seen-set
    is a set for O(1) lookups.
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    from chlamdb.biosqldb import biosql_own_sql_tables
    import sqlite3
    sqlite_conn = sqlite3.connect(PMID_db_path)
    sqlite_cursor = sqlite_conn.cursor()
    server, db = manipulate_biosqldb.load_db(db_name)
    sql1 = 'create table string.seqfeature_id2pmid_%s (seqfeature_id INTEGER, pmid INTEGER);' % db_name
    sql2 = 'create table if not exists string.pmid2data (pmid INTEGER, title TEXT, authors TEXT, source TEXT, abstract TEXT);'
    server.adaptor.execute(sql1, )
    server.adaptor.execute(sql2, )
    # locus_tag -> seqfeature_id mapping
    sql = 'select locus_tag, seqfeature_id from annotation.seqfeature_id2locus_%s' % db_name
    locus_tag2seqfeaure_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))
    # pmids already present (normalised to str)
    sql = 'select pmid from string.pmid2data '
    pmid_already_in_db = set(
        str(i[0]) for i in server.adaptor.execute_and_fetchall(sql, ))
    sql = 'select * from hash2pmid'
    hash_and_pmid = sqlite_cursor.execute(sql, ).fetchall()
    sql = 'select * from pmid2data'
    pmid2data = manipulate_biosqldb.to_dict(
        sqlite_cursor.execute(sql, ).fetchall())
    template1 = 'insert into string.pmid2data values (%s, %s, %s, %s, %s)'
    template2 = 'insert into string.seqfeature_id2pmid_%s' % db_name + ' values(%s, %s)'
    for protein_hash, pmid in hash_and_pmid:
        pmid_key = str(pmid)
        if pmid_key not in pmid_already_in_db:
            try:
                # assumes pmid2data is keyed by str(pmid) -- TODO confirm
                # the sqlite schema
                server.adaptor.execute(template1, [pmid] + list(pmid2data[pmid_key]))
                pmid_already_in_db.add(pmid_key)
            except KeyError:
                print("Problem with pmid %s" % pmid)
        for locus_tag in hash2locus_list[protein_hash]:
            server.adaptor.execute(template2, [locus_tag2seqfeaure_id[locus_tag], pmid])
    server.commit()
    sql1 = 'create index sf on string.seqfeature_id2pmid_%s(seqfeature_id)' % db_name
    sql2 = 'create index pm1 on string.seqfeature_id2pmid_%s(pmid)' % db_name
    sql3 = 'create index pm2 on string.pmid2data (pmid)'
    server.adaptor.execute(sql1, )
    server.adaptor.execute(sql2, )
    server.adaptor.execute(sql3, )
def get_all_accessions(db_name):
    """Return the list of bioentry accessions belonging to biodatabase
    db_name."""
    from chlamdb.biosqldb import manipulate_biosqldb
    server, db = manipulate_biosqldb.load_db(db_name)
    sql = 'select accession from bioentry t1 inner join biodatabase t2 on t1.biodatabase_id=t2.biodatabase_id ' \
          ' where t2.name="%s"' % db_name
    return [row[0] for row in server.adaptor.execute_and_fetchall(sql, )]
def orthogroup2nucleotide_seq_list(locus_tag2nucl_sequence, group, biodb_name):
    """Return the nucleotide sequences of every locus belonging to `group`.

    BUG FIX: load_db() was called without the database name and its return
    value (a (server, db) tuple everywhere else in this module) was bound to
    a single variable; now unpacked and passed biodb_name, consistent with
    every other function here. Also removed an unused local and the
    redundant double lookup.
    """
    server, db = manipulate_biosqldb.load_db(biodb_name)
    locus_list = manipulate_biosqldb.orthogroup_id2locus_tag_list(
        server, group, biodb_name)
    # each locus row carries the locus_tag at index 2
    return [locus_tag2nucl_sequence[str(row[2])] for row in locus_list]