def locus2ko_table_legacy(biodatabase, hash2ko, hash2locus_list): # create legacy table locus2ko # TODO: remove all depenancies to this table # content: ''' taxon_id, locus_tag, orthogroup, ko_id (not numerical id but KO*) ''' from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodatabase) sql = 'select locus_tag, taxon_id from orthology_detail_%s' % biodatabase sql2 = 'select locus_tag, orthogroup from orthology_detail_%s' % biodatabase locus2taxon_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql)) locus2orthogroup = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql2)) sql2 = 'CREATE TABLE IF NOT EXISTS enzyme.locus2ko_%s (taxon_id INT,'\ ' locus_tag VARCHAR(200),' \ ' orthogroup varchar(200),' \ ' ko_id VARCHAR(200), index taxon_id (taxon_id), index ko_id (ko_id));' % (biodatabase) server.adaptor.execute_and_fetchall(sql2, ) for hash in hash2ko: for locus in hash2locus_list[hash]: ko = hash2ko[hash]["KO"] sql = 'insert into enzyme.locus2ko_%s (taxon_id, locus_tag, orthogroup, ko_id) values ("%s", "%s", "%s", "%s")' % ( biodatabase, locus2taxon_id[locus], locus, locus2orthogroup[locus], ko) server.adaptor.execute(sql, ) server.commit()
def blastswiss2biosql(locus_tag2seqfeature_id, db_name, n_procs, mysql_host, mysql_user, mysql_pwd, mysql_db, *input_blast_files): import numpy from multiprocessing import Process create_sql_blast_swissprot_tables(db_name, mysql_host, mysql_user, mysql_pwd, mysql_db) # load blast data into blastnr_ db_name table n_cpu = n_procs n_poc_per_list = int(numpy.ceil(len(input_blast_files) / float(n_cpu))) query_lists = _chunks(input_blast_files, n_poc_per_list) print('n lists: %s' % len(query_lists)) sql = 'create table if not exists blastnr.gi2taxon_and_description (gi INT, taxon_id INT, description TEXT,' \ ' taxonomy TEXT, source TEXT);' server.adaptor.execute(sql, ) print('get locus2taxon_id') sql = 'select locus_tag, taxon_id from orthology_detail_%s' % db_name locus_tag2taxon_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) print('get locus2bioentry') sql2 = 'select locus_tag,bioentry_id from biodatabase t1 ' \ ' inner join bioentry as t2 on t1.biodatabase_id=t2.biodatabase_id' \ ' inner join orthology_detail_%s t3 on t2.accession=t3.accession where t1.name="%s"' % (db_name,db_name) locus_tag2bioentry_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql2, )) if len(input_blast_files) > n_poc_per_list: procs = [] for one_list in query_lists: proc = Process(target=load_blastswissprot_file_into_db, args=(locus_tag2taxon_id, locus_tag2seqfeature_id, locus_tag2bioentry_id, mysql_host, mysql_user, mysql_pwd, mysql_db, one_list, biodb)) procs.append(proc) proc.start() # Wait for all worker processes to finish for proc in procs: proc.join() else: print( 'Only %s input blast file(s) for %s cpus, not working in paralell mode' % (len(input_blast_files), n_poc_per_list)) load_blastswissprot_file_into_db(locus_tag2taxon_id, locus_tag2seqfeature_id, locus_tag2bioentry_id, mysql_host, mysql_user, mysql_pwd, mysql_db, input_blast_files, biodb) sys.stdout.write("done!")
def load_PMID(PMID_db_path, hash2locus_list, db_name): from chlamdb.biosqldb import manipulate_biosqldb from chlamdb.biosqldb import biosql_own_sql_tables import sqlite3 sqlite_conn = sqlite3.connect(PMID_db_path) sqlite_cursor = sqlite_conn.cursor() server, db = manipulate_biosqldb.load_db(db_name) sql1 = 'create table string.seqfeature_id2pmid_%s (seqfeature_id INTEGER, pmid INTEGER);' % db_name sql2 = 'create table if not exists string.pmid2data (pmid INTEGER, title TEXT, authors TEXT, source TEXT, abstract TEXT);' server.adaptor.execute(sql1, ) server.adaptor.execute(sql2, ) # get locus_tag2seqfeaute_id dictionnary sql = 'select locus_tag, seqfeature_id from annotation.seqfeature_id2locus_%s' % db_name locus_tag2seqfeaure_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) # retrieve existing pmid data (if any) sql = 'select pmid from string.pmid2data ' pmid_already_in_db = [ str(i[0]) for i in server.adaptor.execute_and_fetchall(sql, ) ] sql = 'select * from hash2pmid' hash_and_pmid = sqlite_cursor.execute(sql, ).fetchall() sql = 'select * from pmid2data' pmid2data = manipulate_biosqldb.to_dict( sqlite_cursor.execute(sql, ).fetchall()) template1 = 'insert into string.pmid2data values (%s, %s, %s, %s, %s)' template2 = 'insert into string.seqfeature_id2pmid_%s' % db_name + ' values(%s, %s)' for n, row in enumerate(hash_and_pmid): hash, pmid = row locus_list = hash2locus_list[hash] if pmid not in pmid_already_in_db: try: server.adaptor.execute(template1, [pmid] + list(pmid2data[str(pmid)])) pmid_already_in_db.append(pmid) except KeyError: print("Problem with pmid %s" % pmid) for locus_tag in locus_list: server.adaptor.execute(template2, [locus_tag2seqfeaure_id[locus_tag], pmid]) server.commit() sql1 = 'create index sf on string.seqfeature_id2pmid_%s(seqfeature_id)' % db_name sql2 = 'create index pm1 on string.seqfeature_id2pmid_%s(pmid)' % db_name sql3 = 'create index pm2 on string.pmid2data (pmid)' server.adaptor.execute(sql1, ) server.adaptor.execute(sql2, ) server.adaptor.execute(sql3, )
def get_species_data(server, biodb): from chlamdb.biosqldb import manipulate_biosqldb """ for each species, report - number of complete genomes (1 contig) - number of draft genomes (>1 contig) - completeness range - ideally distinguish metagenomes from cultured representatives """ sql1 = f''' select t3.species,count(*) as n_complete from biosqldb.bioentry t1 inner join biosqldb.taxid2species_{biodb} t2 on t1.taxon_id=t2.taxon_id inner join biosqldb.species_curated_taxonomy_{biodb} t3 on t2.species_id=t3.species_id inner join biosqldb.genomes_info_{biodb} t4 on t1.accession=t4.ACCESSION where t4.n_contigs=1 and t1.description not like "%%%%plasmid%%%%" group by species; ''' sql2 = f''' select t3.species,count(*) as n_incomplete from biosqldb.bioentry t1 inner join biosqldb.taxid2species_{biodb} t2 on t1.taxon_id=t2.taxon_id inner join biosqldb.species_curated_taxonomy_{biodb} t3 on t2.species_id=t3.species_id inner join biosqldb.genomes_info_{biodb} t4 on t1.accession=t4.ACCESSION where t4.n_contigs>1 and t1.description not like "%%%%plasmid%%%%" group by species; ''' sql3 = f''' select t3.species, t5.completeness,t4.n_contigs from biosqldb.bioentry t1 inner join biosqldb.taxid2species_2019_06_PVC t2 on t1.taxon_id=t2.taxon_id inner join biosqldb.species_curated_taxonomy_2019_06_PVC t3 on t2.species_id=t3.species_id inner join biosqldb.genomes_info_2019_06_PVC t4 on t1.accession=t4.ACCESSION inner join custom_tables.checkm_2019_06_PVC t5 on t1.taxon_id=t5.taxon_id where t1.description not like "%%%%plasmid%%%%"; ''' species2n_complete_genomes = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql1,)) species2n_draft_genomes = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql2,)) species2completeness = {} for row in server.adaptor.execute_and_fetchall(sql3,): species, completeness, n_contigs = row completeness = float(completeness) if int(n_contigs) == 1: continue if species not in species2completeness: species2completeness[species] = [completeness, completeness] else: if completeness < species2completeness[species][0]: print("smaller!") species2completeness[species][0] = completeness if completeness > species2completeness[species][1]: print("Larger") species2completeness[species][1] = completeness print(species2completeness) return species2n_complete_genomes, species2n_draft_genomes, species2completeness
def biodb2pairwise_dist_phylogenies(biodb): from ete2 import Tree from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) sql = 'select phylogeny from biosqldb_phylogenies.%s' % biodb all_phylogenies = [ i[0] for i in server.adaptor.execute_and_fetchall(sql, ) ] sql = 'select seqfeature_id, taxon_id from custom_tables.locus2seqfeature_id_%s' % biodb seqfeature_id2taxon_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) sql = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb locus_tag2seqfeature_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) sql = 'create table if not exists biosqldb_phylogenies.pairwise_leaf_dist_%s (taxon_1 INT, ' \ ' taxon_2 INT, ' \ ' seqfeature_1 INT, ' \ ' seqfeature_2 INT, ' \ ' distance FLOAT, ' \ ' index taxon_1(taxon_1), ' \ ' index taxon_2(taxon_2), ' \ ' index seqfeature_1(seqfeature_1),' \ ' index seqfeature_2(seqfeature_2), ' \ ' index distance(distance))' % (biodb) server.adaptor.execute(sql, ) for c, phylogeny in enumerate(all_phylogenies): t = Tree(phylogeny) leaf_list = [i for i in t.iter_leaves()] print '%s / %s -- %s' % (c, len(all_phylogenies), len(leaf_list)) for n, l1 in enumerate(leaf_list): #print "%s / %s" % (n, len(leaf_list)) for l2 in leaf_list[n + 1:]: seqid_1 = locus_tag2seqfeature_id[str(l1.name)] seqid_2 = locus_tag2seqfeature_id[str(l2.name)] taxon_1 = seqfeature_id2taxon_id[str(seqid_1)] taxon_2 = seqfeature_id2taxon_id[str(seqid_2)] sql = 'insert into biosqldb_phylogenies.pairwise_leaf_dist_%s values (%s, %s, %s, %s, %s)' % ( biodb, taxon_1, taxon_2, seqid_1, seqid_2, l1.get_distance(l2)) server.adaptor.execute(sql, ) server.commit()
def load_locus2cog_into_sqldb_legacy(input_blast_files, biodb, cdd_id2cog_id, hash2locus_tag_list): import MySQLdb import os from chlamdb.biosqldb import manipulate_biosqldb mysql_host = 'localhost' mysql_user = '******' mysql_pwd = os.environ['SQLPSW'] mysql_db = 'COG' conn = MySQLdb.connect(host=mysql_host, user=mysql_user, passwd=mysql_pwd, db=mysql_db) cursor = conn.cursor() sql = 'create table COG.locus_tag2gi_hit_%s (accession varchar(100), locus_tag varchar(100), gi INT, COG_id varchar(100),' \ 'index locus_tag (locus_tag), index accession (accession))' % biodb cursor.execute(sql) conn.commit() sql = 'select locus_tag, accession from orthology_detail_%s' % biodb sql3 = 'select protein_id, COG_id from COG.cog_2014;' sql2 = 'select protein_id, locus_tag from orthology_detail_%s' % biodb sql5 = 'select locus_tag,length(translation) from orthology_detail_%s;' % biodb server, db = manipulate_biosqldb.load_db(biodb) locus2genome_accession = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql)) protein_id2locus_tag = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql2)) protein_id2COG = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql3)) locus_tag2protein_length = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql5)) for input_blast in input_blast_files: locus2data = blast2COG(input_blast, hash2locus_tag_list, cdd_id2cog_id, cog_id2length, locus_tag2protein_length) for locus in locus2data: sql = 'INSERT into locus_tag2gi_hit_%s (accession, locus_tag, gi, COG_id) VALUES ("%s", "%s", %s, "%s")' % ( biodb, locus2genome_accession[locus], locus, 0, # no gi anymore locus2data[locus]["cog_id"]) cursor.execute(sql) conn.commit()
def collect_Pfam_accession(db_name): ''' collect presence/absence of Pfam domains for each genome/plasmid accession :param db_name: :return: ''' from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(db_name) sql_head = 'INSERT INTO comparative_tables.Pfam_accessions_%s (id,' % db_name accession_list = get_all_accessions(db_name) sql_head += ','.join(accession_list) + ') values (' all_pfam_ids_sql = 'select signature_accession from interpro_%s where analysis="Pfam" ' \ 'group by signature_accession;' % db_name all_pfam_ids = [ i[0] for i in server.adaptor.execute_and_fetchall(all_pfam_ids_sql, ) ] sql = 'select t1.accession, t1.description from bioentry t1 inner join biodatabase t2 on t1.biodatabase_id=t2.biodatabase_id' \ ' where t2.name="%s"' % db_name accession2organism = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) i = 0 for accession in all_pfam_ids: print(i, '/', len(all_pfam_ids), accession) i += 1 sql= 'select organism, count(*) from biosqldb.interpro_%s ' \ ' where analysis="Pfam" and signature_accession="%s" group by organism;' % (db_name, accession) data = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) sql_temp = sql_head + '"%s",' % accession for accession in accession_list: try: sql_temp += '%s,' % data[accession2organism[str(accession)]] except: sql_temp += '0,' sql_temp = sql_temp[0:-1] + ');' server.adaptor.execute(sql_temp, ) server.adaptor.commit()
def best_blast_hit_majority_species(biodb): ''' 1. for each protein, get the species of the best hit, and calculate the proportion of the proteome it represent :param biodb: :param n_hits: :return: ''' from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) sql = 'select taxon_id from biodatabase t1 inner join bioentry t2 on t1.biodatabase_id=t2.biodatabase_id' \ ' where t1.name="%s" group by taxon_id' % biodb taxon_list = [i[0] for i in server.adaptor.execute_and_fetchall(sql, )] sql = 'CREATE table blastnr.blastnr_best_hits_species_%s (taxon_id INT, hit_taxid INT, count INT, proportion FLOAT)' % biodb server.adaptor.execute(sql, ) server.commit() sql = 'select taxon_id, count(*) as n from biosqldb.orthology_detail_%s group by taxon_id' % ( biodb) taxon_id2n_CDS = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) for one_taxon in taxon_list: print one_taxon sql2 = 'select A.subject_taxid, count(*) from blastnr.blastnr_%s A ' \ ' where A.query_taxon_id=%s and hit_number=1 group by A.subject_taxid;' % (biodb, one_taxon) taxon_id2count = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql2, )) #print seqfeature2best_hit_phylum total_bbh = float(sum([int(i) for i in taxon_id2count.values()])) for n, taxon in enumerate(taxon_id2count): sql = 'insert into blastnr.blastnr_best_hits_species_%s values(%s,%s,%s, %s)' % ( biodb, one_taxon, taxon, taxon_id2count[taxon], round((taxon_id2count[taxon] / total_bbh) * 100, 2)) server.adaptor.execute(sql, ) server.commit()
def get_interpro2go_table(): ''' :param interpro_release: par exemple: 60.0 :return: ''' import urllib2 import MySQLdb from chlamdb.biosqldb import manipulate_biosqldb import os sqlpsw = os.environ['SQLPSW'] conn = MySQLdb.connect( host="localhost", # your host, usually localhost user="******", # your username passwd=sqlpsw, # your password db="interpro") # name of the data base cursor = conn.cursor() sql = 'CREATE table interpro2gene_ontology (interpro_id INT, go_id INT, index interpro_id(interpro_id), index go_id(go_id))' #cursor.execute(sql,) #conn.commit() sql = 'select name,interpro_id from entry' cursor.execute(sql, ) interpro_name2interpro_id = manipulate_biosqldb.to_dict(cursor.fetchall()) sql = 'select acc,id from gene_ontology.term where acc like "GO%"' cursor.execute(sql, ) go_name2go_id = manipulate_biosqldb.to_dict(cursor.fetchall()) link = 'ftp://ftp.ebi.ac.uk/pub/databases/interpro/interpro2go' req = urllib2.Request(link) entry_list = urllib2.urlopen(req) echec = 0 for i, line in enumerate(entry_list): if line[0] == '!': continue interpro_name = line.rstrip().split(':')[1].split(' ')[0] go_name = line.rstrip().split(' ; ')[1] try: sql = 'insert into interpro2gene_ontology values(%s, %s)' % ( interpro_name2interpro_id[interpro_name], go_name2go_id[go_name]) cursor.execute(sql, ) conn.commit() except: echec += 1 print(echec, i)
def seqfeature_id2n_species_chlamydiae_only(biodb, chlamydiae_taxon_list): from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) sql0 = 'create table if not exists custom_tables.seqfeature_id2n_species_chlamydiae_%s (seqfeature_id INT primary KEY, n_species INT, INDEX n_species(n_species))' % biodb server.adaptor.execute(sql0, ) sql1 = 'select locus_tag, orthogroup from orthology_detail_%s' % biodb orthogroup2locus_list = {} for row in server.adaptor.execute_and_fetchall(sql1, ): if row[1] not in orthogroup2locus_list: orthogroup2locus_list[row[1]] = [row[0]] else: orthogroup2locus_list[row[1]].append(row[0]) sql2 = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb locus_tag2seqfeature_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql2, )) sql3 = 'select taxon_id, species_id from species_%s' % biodb taxon_id2species_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql3, )) sql4 = 'select locus_tag, taxon_id from orthology_detail_%s' % biodb locus2taxon_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql4, )) for group in orthogroup2locus_list: species = [] locus_keep = [] for locus in orthogroup2locus_list[group]: taxon_id = locus2taxon_id[locus] if taxon_id in chlamydiae_taxon_list: locus_keep.append(locus) species_id = taxon_id2species_id[str(taxon_id)] if species_id not in species: species.append(species_id) for one_locus in locus_keep: seqfeature_id = locus_tag2seqfeature_id[one_locus] sql = 'insert into custom_tables.seqfeature_id2n_species_chlamydiae_%s values (%s, %s)' % ( biodb, seqfeature_id, len(species)) server.adaptor.execute(sql, ) server.commit()
def load_hmm_data(biodb, database_name, table_name, hmm_tab_files): from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) sql = 'create table %s.%s (locus_tag varchar(400), taxon_id INT, hmm_accession varchar(400));' % (database_name, table_name) server.adaptor.execute_and_fetchall(sql,) sql = 'select locus_tag, taxon_id from biosqldb.orthology_detail_%s' % biodb locus2taxon_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,)) for hmm_file in hmm_tab_files: print hmm_file with open(hmm_file, 'r') as f: for row in f: if row[0] == '#': continue else: data = row.rstrip().split() locus_tag = data[0] hit_accession = data[3] print '%s\t%s\t%s' % (hit_accession, locus_tag, locus2taxon_id[locus_tag]) sql = 'insert into %s.%s values ("%s", %s, "%s")' % (database_name, table_name,locus_tag, locus2taxon_id[locus_tag], hit_accession) server.adaptor.execute(sql,) server.adaptor.commit()
def get_whole_db_interpro_taxonomy(): import MySQLdb from chlamdb.biosqldb import manipulate_biosqldb import os sqlpsw = os.environ['SQLPSW'] conn = MySQLdb.connect( host="localhost", # your host, usually localhost user="******", # your username passwd=sqlpsw, # your password db="interpro") # name of the data base cursor = conn.cursor() interpro_vesrion = get_interpro_version() create_interpro_taxonomy_table(interpro_vesrion) sql = 'select name,interpro_id from interpro.entry' cursor.execute(sql, ) interpro_accession2interpro_id = manipulate_biosqldb.to_dict( cursor.fetchall()) for i, interpro_accession in enumerate(interpro_accession2interpro_id): print '%s / %s -- %s' % (i, len(interpro_accession2interpro_id), interpro_accession) euk, bact, arch, virus = interpro_entry2taxonomy(interpro_accession) total = float(euk + bact + arch + virus) sql = 'insert into interpro_taxonomy_v_%s values (%s,%s,%s,%s,%s,%s,%s,%s, %s, %s)' % ( int(interpro_vesrion), interpro_accession2interpro_id[interpro_accession], euk, bact, arch, virus, total, round((euk / total) * 100, 2), round((bact / total) * 100, 2), round( (arch / total) * 100, 2), round((virus / total) * 100, 2)) cursor.execute(sql, ) conn.commit()
def load_sport_data(psortdb_output_files, db_name, hash2locus_list): from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(db_name) hash2psort_results = {} for psortdb_output in psortdb_output_files: hash2psort_results.update(parse_psort_results(psortdb_output)) sql = f'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_{db_name}' locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,)) sql = f'create table if not exists custom_tables.seqfeature_id2psortb_{db_name} (seqfeature_id INT, final_prediction varchar(200), score FLOAT)' server.adaptor.execute(sql,) sql_template = f'insert into custom_tables.seqfeature_id2psortb_{db_name} values (%s, %s, %s)' for hash in hash2psort_results: for locus in hash2locus_list[hash]: final_pred = hash2psort_results[hash]['final_prediction'] if final_pred == 'Unknown': score = None else: score = hash2psort_results[hash][final_pred] server.adaptor.execute(sql_template, (locus_tag2seqfeature_id[locus], final_pred, score)) sql = f'create index sfp on custom_tables.seqfeature_id2psortb_{db_name}(seqfeature_id)' server.adaptor.execute(sql,) server.commit()
def locus2ec_table(hash2ec_dico, biodatabase, hash2locus_list): from chlamdb.biosqldb import manipulate_biosqldb from chlamdb.biosqldb import biosql_own_sql_tables server, db = manipulate_biosqldb.load_db(biodatabase) sql2 = 'CREATE TABLE IF NOT EXISTS enzyme.seqfeature_id2ec_%s (enzyme_id INT AUTO_INCREMENT PRIMARY KEY,' \ ' seqfeature_id INT,' \ ' ec_id INT,' \ ' FOREIGN KEY(ec_id) REFERENCES enzymes(enzyme_id));' % (biodatabase) server.adaptor.execute_and_fetchall(sql2, ) sql = 'select locus_tag, seqfeature_id from annotation.seqfeature_id2locus_%s' % biodatabase locus_tag2seqfeature_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) for hash in hash2ec_dico: for ec_data in hash2ec_dico[hash]: for locus in hash2locus_list[hash]: sql = 'select enzyme_id from enzyme.enzymes where ec="%s"' % ec_data[ 0] ec_id = server.adaptor.execute_and_fetchall(sql, )[0][0] seqfeature_id = locus_tag2seqfeature_id[locus] sql = 'insert into enzyme.seqfeature_id2ec_%s (seqfeature_id, ec_id) values (%s, %s)' % ( biodatabase, seqfeature_id, ec_id) server.adaptor.execute(sql, ) server.commit()
def locus2ko_table(locus_tag2ko_dico, biodatabase, ko_accession2ko_id): from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodatabase) sql2 = 'select locus_tag, seqfeature_id from annotation.seqfeature_id2locus_%s' % biodatabase locus2seqfeature_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql2)) sql2 = 'CREATE TABLE IF NOT EXISTS enzyme.seqfeature_id2ko_%s (seqfeature_id INT,' \ ' ko_id INT, ' \ ' index ko_id (ko_id),' \ ' index seqid (seqfeature_id));' % (biodatabase) server.adaptor.execute_and_fetchall(sql2, ) for locus in locus_tag2ko_dico: ko = locus_tag2ko_dico[locus] ko_id = ko_accession2ko_id[ko] seqfeature_id = locus2seqfeature_id[locus] sql = 'insert into enzyme.seqfeature_id2ko_%s (seqfeature_id, ko_id) values (%s, %s)' % ( biodatabase, seqfeature_id, ko_id) server.adaptor.execute(sql, ) server.commit()
def load_hash2locus(hash2locus_list, biodb): import MySQLdb import os from chlamdb.biosqldb import manipulate_biosqldb sqlpsw = os.environ['SQLPSW'] conn = MySQLdb.connect( host="localhost", # your host, usually localhost user="******", # your username passwd=sqlpsw) # name of the data base cursor = conn.cursor() sql = 'select locus_tag, seqfeature_id from annotation.seqfeature_id2locus_%s' % biodb cursor.execute(sql, ) locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(cursor.fetchall()) sql = 'create table annotation.hash2seqfeature_id_%s (hash varchar(300), seqfeature_id INTEGER)' % biodb cursor.execute(sql, ) for hash in hash2locus_list: locus_list = hash2locus_list[hash] for locus in locus_list: cursor.execute( 'insert into annotation.hash2seqfeature_id_%s values ("%s", "%s")' % (biodb, hash, locus_tag2seqfeature_id[locus])) conn.commit() sql1 = 'create index h1 ON annotation.hash2seqfeature_id_%s(hash)' % biodb sql2 = 'create index h2 ON annotation.hash2seqfeature_id_%s(seqfeature_id)' % biodb cursor.execute(sql1, ) cursor.execute(sql2, ) conn.commit()
def get_pairwise_connexions(accession_1, accession_2, biodb): from chlamdb.biosqldb import manipulate_biosqldb import numpy import pandas server, db = manipulate_biosqldb.load_db(biodb) sql1 = 'select seqfeature_id, start, stop from biosqldb.orthology_detail_%s where accession in ("%s","%s") ' % (biodb, accession_1, accession_2) seqfeature_id2location = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql1,)) print seqfeature_id2location.keys()[0:10] sql2 = 'select accession, taxon_id from biodatabase t1 inner join bioentry t2 on t1.biodatabase_id=t2.biodatabase_id' \ ' where t1.name="%s"' % biodb print sql2 accession2taxon_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql2,)) comp1_sql = 'select locus_1,locus_2,identity from (select * from ' \ ' comparative_tables.identity_closest_homolog2_%s where taxon_1=%s and taxon_2=%s) A ' \ ' inner join biosqldb.orthology_detail_%s B on A.locus_1=B.seqfeature_id;' % (biodb, accession2taxon_id[accession_1], accession2taxon_id[accession_2], biodb) data = server.adaptor.execute_and_fetchall(comp1_sql,) comparison_table = [] for row in data: print row try: start1 = seqfeature_id2location[int(row[0])][0] stop1 = seqfeature_id2location[int(row[0])][0] start2 = seqfeature_id2location[int(row[1])][0] stop2 = seqfeature_id2location[int(row[1])][0] identity = row[2] comparison_table.append([start1, stop1, start2, stop2, identity]) except: pass data = numpy.array(comparison_table) columns = ['start1', 'end1', 'start2', 'end2', 'identity'] df = pandas.DataFrame(data, columns=columns) return df
def load_effectiveT3_table(table_file, biodb, hash2locus_list): from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) sql = 'create database if not exists effectors;' server.adaptor.execute(sql, ) sql = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb locus_tag2seqfeature_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) sql = 'select locus_tag, taxon_id from orthology_detail_%s' % biodb locus_tag2taxon_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) sql = 'CREATE table IF NOT EXISTS effectors.predicted_effectiveT3_%s (seqfeature_id INT primary key,taxon_id INT,score FLOAT,effectors INT,' \ ' INDEX taxon_id (taxon_id))' % biodb server.adaptor.execute(sql, ) with open(table_file, 'r') as f: for i, row in enumerate(f): if row[0] == '#': continue data = row.rstrip().split(';') if data[0] == 'Id': continue hash = data[0] if data[3] == 'false': effector = 0 continue elif data[3] == 'true': effector = 1 else: print('unknown result', data) for locus in hash2locus_list[hash]: # CRC-02125D79C6DFBC1D; RhT_01099 Rhabdochlamydia helvetica T3358;1.000000000000000;true sql = 'insert into effectors.predicted_effectiveT3_%s values (%s,%s,%s,%s)' % ( biodb, locus_tag2seqfeature_id[locus], locus_tag2taxon_id[locus], data[2], effector) server.adaptor.execute(sql, ) server.commit()
def biodb2cds_gc(biodb): from chlamdb.biosqldb import manipulate_biosqldb from Bio.SeqUtils import GC123 server, db = manipulate_biosqldb.load_db(biodb) sql1 = 'select distinct accession from orthology_detail_%s' % biodb sql2 = 'select locus_tag, taxon_id from orthology_detail_%s' % biodb sql3 = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb accession_list = [i[0] for i in server.adaptor.execute_and_fetchall(sql1,)] locus2taxon_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql2,)) locus2seqfeature_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql3,)) sql_head = 'create table IF NOT EXISTS custom_tables.gc_content_%s (taxon_id INT, ' \ ' seqfeature_id INT,' \ ' seq_length INT, gc_percent FLOAT, gc_1 FLOAT, gc_2 FLOAT, gc_3 FLOAT, ' \ ' INDEX seqfeature_id(seqfeature_id), index taxon_id(taxon_id))' % biodb server.adaptor.execute(sql_head,) count_all=0 for accession in accession_list: print (accession) record = db.lookup(accession=accession) seq = record.seq for n, feature in enumerate(record.features): if feature.type == 'CDS' and not 'pseudo' in feature.qualifiers and not 'pseudogene' in feature.qualifiers and 'translation' in feature.qualifiers: count_all+=1 dna_sequence = feature.extract(seq) locus = feature.qualifiers['locus_tag'][0] gc, gc1, gc2, gc3 = GC123(str(dna_sequence)) sql = 'insert into custom_tables.gc_content_%s values (%s, %s, %s, %s, %s, %s, %s);' % (biodb, locus2taxon_id[locus], locus2seqfeature_id[locus], len(dna_sequence), round(gc,2), round(gc1), round(gc2), round(gc3)) server.adaptor.execute(sql,) server.commit()
def import_silix(silix_output, biodb, silix_cutoff): from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) import re # columns # 0 taxon_id # 1 seqfeature_id # 2 silix_family_id sql = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb locus2seqfeature_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,)) sql = 'select locus_tag, taxon_id from biosqldb.orthology_detail_%s' % biodb locus2taxon_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,)) sql = 'create table IF NOT EXISTS comparative_tables.silix_%s_%s (taxon_id INT, ' \ ' seqfeature_id INT, ' \ ' silix_id INT,' \ ' INDEX seqfeature_id (seqfeature_id), index taxon_id(taxon_id), index silix_id(silix_id));' % (biodb, silix_cutoff) server.adaptor.execute(sql,) server.commit() with open(silix_output, 'r') as f: row_list = [i for i in f] family_list = list(set([i.rstrip().split('\t')[0] for i in row_list])) for n, row in enumerate(row_list): if n == 0: continue data = row.rstrip().split('\t') family_index = family_list.index(data[0]) locus_tag = data[1] taxon_id = locus2taxon_id[locus_tag] seqfeature_id = locus2seqfeature_id[locus_tag] print family_index, locus_tag sql = 'insert into comparative_tables.silix_%s_%s values (%s, %s, %s);' % (biodb, silix_cutoff, taxon_id, seqfeature_id, family_index) server.adaptor.execute(sql,) server.commit()
def load_T3_MM_table(table_file, biodb, hash2locus_list): from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) sql = 'create database if not exists effectors;' server.adaptor.execute(sql, ) sql = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb locus_tag2seqfeature_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) sql = 'select locus_tag, taxon_id from orthology_detail_%s' % biodb locus_tag2taxon_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) sql = 'CREATE table effectors.predicted_T3MM_%s (seqfeature_id INT primary key,taxon_id INT,value FLOAT,effectors INT,' \ ' probability FLOAT, INDEX taxon_id (taxon_id))' % biodb server.adaptor.execute(sql, ) with open(table_file, 'r') as f: for row in f: data = row.rstrip().split(',') if data[0] == 'seqName': continue else: hash = data[0] if data[2] == 'NO': effector = 0 continue elif data[2] == 'YES': effector = 1 else: print('unknown result', data) for locus in hash2locus_list[hash]: # seqName,value,T3SE,probability sql = 'insert into effectors.predicted_T3MM_%s values (%s,%s,%s,%s, %s)' % ( biodb, locus_tag2seqfeature_id[locus], locus_tag2taxon_id[locus], data[1], effector, data[3]) server.adaptor.execute(sql, ) server.commit()
def load_chaperones_table(table_file, biodb='chlamydia_04_16'): from chlamdb.biosqldb import manipulate_biosqldb import re ''' ATTENTION possible d'avoir plusieurs sequences cibles identifiees dans une meme sequence: dubplicate seqfeature dans la table ''' server, db = manipulate_biosqldb.load_db(biodb) sql = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb locus_tag2seqfeature_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) sql = 'select locus_tag, taxon_id from orthology_detail_%s' % biodb locus_tag2taxon_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) sql = 'CREATE table IF NOT EXISTS effectors.predicted_chaperones_%s (seqfeature_id INT,taxon_id INT,motif varchar(400), info TEXT,' \ ' INDEX taxon_id (taxon_id), index seqfeature_id (seqfeature_id))' % biodb server.adaptor.execute(sql, ) with open(table_file, 'r') as f: for i, row in enumerate(f): data = row.rstrip().split('\t') if data[1] == 'Protein does not contain a CCBD within the first 150 amino acids': continue else: motif = data[1].split('(')[1].split(')')[0] info = data[1].split(') ')[1] #Protein,SVM-Value,T4S protein or not #locus = re.sub('"', "", data[0]) sql = 'insert into effectors.predicted_chaperones_%s values (%s,%s,"%s", "%s")' % ( biodb, locus_tag2seqfeature_id[data[0]], locus_tag2taxon_id[data[0]], motif, info) try: server.adaptor.execute(sql, ) server.commit() except: print data import sys sys.exit()
def update_analysis_dico(server): from chlamdb.biosqldb import manipulate_biosqldb sql = 'select analysis_name, analysis_id from interpro.analysis' analysis_nam2analysis_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) return analysis_nam2analysis_id
def load_DeepT3_table(table_file, biodb, hash2locus_list): from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(biodb) sql = 'create database if not exists effectors;' server.adaptor.execute(sql, ) sql = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb locus_tag2seqfeature_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) sql = 'select locus_tag, taxon_id from orthology_detail_%s' % biodb locus_tag2taxon_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) sql = 'CREATE table IF NOT EXISTS effectors.predicted_DeepT3_%s (seqfeature_id INT primary key,taxon_id INT,effectors INT,' \ ' INDEX taxon_id (taxon_id))' % biodb server.adaptor.execute(sql, ) with open(table_file, 'r') as f: for i, row in enumerate(f): data = row.rstrip().split('\t') if i < 3: continue else: hash = data[0] if data[1] == 'non-T3SE': effector = 0 continue elif data[1] == 'T3SE': effector = 1 else: print('unknown result', data) for locus in hash2locus_list[hash]: # CRC-20ACA5744563D026 non-T3SE sql = 'insert into effectors.predicted_DeepT3_%s values (%s,%s,%s)' % ( biodb, locus_tag2seqfeature_id[locus], locus_tag2taxon_id[locus], effector) server.adaptor.execute(sql, ) server.commit()
def __init__(self, string_sqlite, blast_results, db_name, hash2locus_list, query_fasta_file, db_fasta_file): self.string_conn = sqlite3.connect(string_sqlite) self.string_cursor = self.string_conn.cursor() self.server, self.db = manipulate_biosqldb.load_db(db_name) self.blast_results = blast_results self.hash2locus_list = hash2locus_list self.db_name = db_name self.query2len = self.parse_fasta(query_fasta_file) self.hit2len = self.parse_fasta(db_fasta_file) self.missing_pmid_data = [] # create tables if not exists self.create_tables() # retrieve data from string db sql = 'select pmid,publication_date,publication_source,linkout_url,authors,title from publications;' sql_species = 'select species_id,compact_name from species' self.pmid2article_data = manipulate_biosqldb.to_dict( self.string_cursor.execute(sql, ).fetchall()) self.species_id2species_name = manipulate_biosqldb.to_dict( self.string_cursor.execute(sql_species).fetchall()) # retrieve protein hash from biosqldb sql = f'select distinct hash from string.seqfeature_id2string_protein_mapping t1 inner join annotation.hash2seqfeature_id_{db_name} t2 on t1.seqfeature_id=t2.seqfeature_id;' self.hash_in_db = set( [i[0] for i in self.server.adaptor.execute_and_fetchall(sql, )]) # check if some data were already inserted sql = 'select pmid from string.pmid2data_stringdb' self.pmid_in_db = [ i[0] for i in self.server.adaptor.execute_and_fetchall(sql, ) ] sql = 'select accession,id from string.string_protein_entry' self.string_proteins2string_protein_id = manipulate_biosqldb.to_dict( self.server.adaptor.execute_and_fetchall(sql, ))
def load_T4SEpre_psAac_table(table_file, biodb='chlamydia_04_16'): from chlamdb.biosqldb import manipulate_biosqldb import re server, db = manipulate_biosqldb.load_db(biodb) sql = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb locus_tag2seqfeature_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) sql = 'select locus_tag, taxon_id from orthology_detail_%s' % biodb locus_tag2taxon_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) sql = 'CREATE table IF NOT EXISTS effectors.predicted_T4SEpre_psAac_%s (seqfeature_id INT primary key,taxon_id INT,SVM_value FLOAT,effectors INT,' \ ' INDEX taxon_id (taxon_id))' % biodb server.adaptor.execute(sql, ) with open(table_file, 'r') as f: for i, row in enumerate(f): data = row.rstrip().split(',') if i < 2: continue else: if data[2] == 'NO': effector = 0 continue elif data[2] == 'YES': effector = 1 else: print 'unknown result', data #Protein,SVM-Value,T4S protein or not #locus = re.sub('"', "", data[0]) sql = 'insert into effectors.predicted_T4SEpre_psAac_%s values (%s,%s,%s,%s)' % ( biodb, locus_tag2seqfeature_id[data[0]], locus_tag2taxon_id[data[0]], data[1], effector) server.adaptor.execute(sql, ) server.commit()
def blastswiss2biosql( locus_tag2seqfeature_id, db_name, n_procs, mysql_host, mysql_user, mysql_pwd, mysql_db, hash2locus_list, *input_blast_files): import numpy from multiprocessing import Process create_sql_blast_swissprot_tables(db_name, mysql_host, mysql_user, mysql_pwd,mysql_db) print ('get locus2taxon_id') sql = 'select locus_tag, taxon_id from orthology_detail_%s' % db_name locus_tag2taxon_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,)) print ('get locus2bioentry') sql2 = 'select locus_tag,bioentry_id from biodatabase t1 ' \ ' inner join bioentry as t2 on t1.biodatabase_id=t2.biodatabase_id' \ ' inner join orthology_detail_%s t3 on t2.accession=t3.accession where t1.name="%s"' % (db_name,db_name) locus_tag2bioentry_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql2,)) load_blastswissprot_file_into_db(locus_tag2taxon_id, locus_tag2seqfeature_id, locus_tag2bioentry_id, mysql_host, mysql_user, mysql_pwd, mysql_db, input_blast_files, biodb, hash2locus_list) sys.stdout.write("done!")
def parse_orthofinder_blast_files(biodb, orthofinder_blast_file_list, locus_tag2orthofinder_id, evalue_cutoff=0.01, identity_cutoff=0, sqlite3=False): from chlamdb.biosqldb import manipulate_biosqldb if not sqlite3: server, db = manipulate_biosqldb.load_db(biodb) else: server, db = manipulate_biosqldb.load_db(biodb, sqlite3) sql = 'select locus_tag, taxon_id from annotation.seqfeature_id2locus_%s' % biodb locus_tag2taxon_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) locus2taxon2best_hit_id = {} for n, file in enumerate(orthofinder_blast_file_list): print(n, len(orthofinder_blast_file_list)) with open(file, 'r') as f: for line in f: #9_1 60_294 56.647 346 139 3 4 338 5 350 3.33e-141 405 data = line.split('\t') seq_1_id = data[0] seq_2_id = data[1] locus_tag_1 = locus_tag2orthofinder_id[seq_1_id] locus_tag_2 = locus_tag2orthofinder_id[seq_2_id] identity = data[2] evalue = data[10] bitscore = data[11] query_align_length = int(data[7]) - int(data[6]) hit_align_length = int(data[9]) - int(data[8]) taxon_id_seq_2 = locus_tag2taxon_id[locus_tag_2] if locus_tag_1 not in locus2taxon2best_hit_id: locus2taxon2best_hit_id[locus_tag_1] = {} locus2taxon2best_hit_id[locus_tag_1][taxon_id_seq_2] = [ locus_tag_2, identity, evalue, bitscore, query_align_length, hit_align_length ] else: # not the best hit, skip if taxon_id_seq_2 in locus2taxon2best_hit_id[locus_tag_1]: continue # best hit else: locus2taxon2best_hit_id[locus_tag_1][ taxon_id_seq_2] = [ locus_tag_2, identity, evalue, bitscore, query_align_length, hit_align_length ] return locus2taxon2best_hit_id
def identity_closest_homolog(db_name): from chlamdb.biosqldb import manipulate_biosqldb from chlamdb.biosqldb import biosql_own_sql_tables import sys server, db = manipulate_biosqldb.load_db(db_name) sql1 = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % db_name locus2seqfeature_id = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql1, )) sql2 = "CREATE TABLE comparative_tables.identity_closest_homolog2_%s(taxon_1 INT NOT NULL," \ " taxon_2 INT NOT NULL," \ " locus_1 INT NOT NULL," \ " locus_2 INT NOT NULL," \ " identity FLOAT, index locus_1(locus_1)," \ " index locus_2(locus_2), index taxon_1(taxon_1), index taxon_2(taxon_2))" % (db_name) server.adaptor.execute(sql2) #identitydico = biosql_own_sql_tables.calculate_average_protein_identity_new_tables(db_name) taxon2description = manipulate_biosqldb.taxon_id2genome_description( server, biodatabase_name=db_name) all_taxons = taxon2description.keys() for i, taxon_1 in enumerate(all_taxons): locus2identity = biosql_own_sql_tables.circos_locus2taxon_highest_identity( db_name, taxon_1) for taxon_2 in all_taxons: if taxon_1 == taxon_2: continue for locus in locus2identity: try: #print taxon_1, taxon_2, locus, locus2identity[locus][long(taxon_2)][1], locus2identity[locus][long(taxon_2)][0] #sys.stdout.write("%s\t%s\n" % (taxon_1, taxon_2)) sql = 'insert into comparative_tables.identity_closest_homolog2_%s(taxon_1, taxon_2, locus_1, locus_2, identity) ' \ ' VALUES ("%s", "%s", "%s", "%s", %s)' % (db_name, taxon_1, taxon_2, locus2seqfeature_id[locus], locus2seqfeature_id[locus2identity[locus][int(taxon_2)][1]], locus2identity[locus][int(taxon_2)][0]) server.adaptor.execute(sql) except KeyError: # no homologs continue server.adaptor.commit()
def get_pathway_ko_association_table(): import os import MySQLdb from chlamdb.biosqldb import manipulate_biosqldb import urllib from Bio.KEGG.KGML import KGML_parser import re sqlpsw = os.environ['SQLPSW'] conn = MySQLdb.connect( host="localhost", # your host, usually localhost user="******", # your username passwd=sqlpsw, # your password db="enzyme") # name of the data base cursor = conn.cursor() sql = 'create table enzyme.pathway2ortholog_associations (pathway_id INT, node_id INT, ko_id varchar(200), ' \ ' index pathway_id(pathway_id), index node_id(node_id), index ko_id(ko_id));' cursor.execute(sql, ) conn.commit() sql2 = 'select pathway_name,pathway_id from enzyme.kegg_pathway' cursor.execute(sql2, ) pathway2pathway_id = manipulate_biosqldb.to_dict(cursor.fetchall()) for pathway in pathway2pathway_id: print(pathway) url_template = 'http://rest.kegg.jp/get/%s/kgml' % re.sub( 'map', 'ko', pathway) print(url_template) try: f = urllib.request.urlopen(url_template) except: continue from Bio.Graphics import KGML_vis pathway_KGML = KGML_parser.read(f.read().decode("UTF-8")) # Loop over the orthologs in the pathway, and change the # background colour orthologs = [e for e in pathway_KGML.orthologs] for o in orthologs: ko_temp_list = list(set([i.rstrip() for i in o.name.split('ko:')])) ko_temp_list = filter(None, ko_temp_list) for ko in ko_temp_list: sql = 'insert into enzyme.pathway2ortholog_associations values(%s, %s, "%s")' % ( pathway2pathway_id[pathway], o.id, ko) cursor.execute(sql, ) conn.commit()