def get_mysql_table(db_name, table_name):
    """Dump a comparative table to ``<table_name>_matrix.tab``.

    Selects the ``id`` column plus one column per taxon id from
    ``comparative_tables.<table_name>_<db_name>`` and writes the rows as
    tab-separated text. The header line holds the quoted genome descriptions
    returned by ``manipulate_biosqldb.taxon_id2genome_description``.

    :param db_name: biodatabase name, also used as the table-name suffix
    :param table_name: comparative table prefix, also used for the file name
    :return: None (the file is written to the current working directory)
    """
    import numpy as np

    server, db = manipulate_biosqldb.load_db(db_name)
    all_taxons_id = manipulate_biosqldb.get_taxon_id_list(server, db_name)

    # One back-quoted column per taxon; joining a list also copes with an
    # empty taxon list, which previously crashed on ``all_taxons_id[-1]``.
    columns = ["id"] + ["`%s`" % taxon_id for taxon_id in all_taxons_id]
    sql = "select %s from comparative_tables.%s_%s" % (", ".join(columns),
                                                       table_name,
                                                       db_name)
    mat = np.array(server.adaptor.execute_and_fetchall(sql, ))

    taxonid2genome = manipulate_biosqldb.taxon_id2genome_description(
        server, db_name, True)
    genome_names = [taxonid2genome[int(taxon_id)] for taxon_id in all_taxons_id]

    # The context manager guarantees the handle is flushed and closed, even
    # when a write fails (the original never closed the file).
    with open("%s_matrix.tab" % table_name, "w") as out:
        out.write('"id"\t"' + '"\t"'.join(genome_names) + '"\n')
        for row in mat:
            out.write("\t".join([str(value) for value in row]) + "\n")
def locus_list2identity_in_other_genomes(locus_list, biodb):
    """Build a tab-separated table of best-hit identities for a list of loci.

    For every locus tag the orthogroup is looked up, then
    ``locus_tag2identity_best_hit_all_genomes`` supplies one identity value
    per genome. Each output row is ``<orthogroup>\\t<identity>\\t...``.

    :param locus_list: iterable of locus tags
    :param biodb: biodatabase name
    :return: the whole table (header line + one line per locus) as a string
    """
    import re

    server, db = manipulate_biosqldb.load_db(biodb)
    locus_tag2seqfeature_id = manipulate_biosqldb.locus_tag2seqfeature_id_dict(
        server, biodb)
    taxon_id2description = manipulate_biosqldb.taxon_id2genome_description(
        server, biodb)

    # Boilerplate stripped from genome descriptions, applied in this order.
    # Raw strings keep the regex escapes valid on recent Python versions.
    removed_patterns = (
        r" subsp\. aureus",
        r", complete genome\.",
        r", complete sequence\.",
        r"strain ",
        r"str\. ",
        r" complete genome sequence\.",
        r" complete genome\.",
        r" chromosome",
        r"Staphylococcus aureus ",
    )
    for taxon_id in list(taxon_id2description):
        description = taxon_id2description[taxon_id]
        for pattern in removed_patterns:
            description = re.sub(pattern, "", description)
        taxon_id2description[taxon_id] = description

    # NOTE(review): the header columns come from a hard-coded reference
    # locus/orthogroup, while each row below iterates its own result dict;
    # confirm both always yield the same taxon order.
    reference = locus_tag2identity_best_hit_all_genomes(biodb, 'wcw_1594',
                                                        'group_417')
    header = 'orthogroup\t' + ''.join(
        taxon_id2description[taxon_id] + '\t' for taxon_id in reference.keys())

    lines = [header]
    for locus in locus_list:
        seqfeature_id = locus_tag2seqfeature_id[locus]
        orthogroup = manipulate_biosqldb.seqfeature_id2orthogroup(
            server, seqfeature_id, biodb)
        taxon2identity = locus_tag2identity_best_hit_all_genomes(
            biodb, locus, orthogroup)
        row = '%s\t' % orthogroup
        for taxon_id in taxon2identity.keys():
            row += '%s\t' % taxon2identity[taxon_id]
        lines.append(row)

    # Every line, header included, is newline-terminated.
    return ''.join(line + '\n' for line in lines)
def identity_closest_homolog(db_name):
    """Create and fill ``comparative_tables.identity_closest_homolog2_<db_name>``.

    For every ordered pair of distinct taxa, one row is inserted per locus of
    the first taxon that has a best hit in the second taxon. Locus tags are
    stored as seqfeature ids. Loci without a hit, or whose tags are missing
    from the locus -> seqfeature_id table, are skipped.

    :param db_name: biodatabase name, also used as the table-name suffix
    :return: None
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    from chlamdb.biosqldb import biosql_own_sql_tables

    server, db = manipulate_biosqldb.load_db(db_name)

    sql1 = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % db_name
    locus2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql1, ))

    sql2 = "CREATE TABLE comparative_tables.identity_closest_homolog2_%s(taxon_1 INT NOT NULL," \
           " taxon_2 INT NOT NULL," \
           " locus_1 INT NOT NULL," \
           " locus_2 INT NOT NULL," \
           " identity FLOAT, index locus_1(locus_1)," \
           " index locus_2(locus_2), index taxon_1(taxon_1), index taxon_2(taxon_2))" % (db_name)
    server.adaptor.execute(sql2)

    taxon2description = manipulate_biosqldb.taxon_id2genome_description(
        server, biodatabase_name=db_name)
    all_taxons = taxon2description.keys()

    for taxon_1 in all_taxons:
        locus2identity = biosql_own_sql_tables.circos_locus2taxon_highest_identity(
            db_name, taxon_1)
        for taxon_2 in all_taxons:
            if taxon_1 == taxon_2:
                continue
            # The per-locus hit dictionaries are indexed by integer taxon id.
            taxon_2_key = int(taxon_2)
            for locus in locus2identity:
                # Only the lookups may legitimately fail (no homolog in
                # taxon_2); the DB call is kept outside the try so that its
                # errors are never silently swallowed.
                try:
                    best_hit = locus2identity[locus][taxon_2_key]
                    identity = best_hit[0]
                    locus_1 = locus2seqfeature_id[locus]
                    locus_2 = locus2seqfeature_id[best_hit[1]]
                except KeyError:
                    continue
                sql = 'insert into comparative_tables.identity_closest_homolog2_%s(taxon_1, taxon_2, locus_1, locus_2, identity) ' \
                      ' VALUES ("%s", "%s", "%s", "%s", %s)' % (db_name,
                                                                taxon_1,
                                                                taxon_2,
                                                                locus_1,
                                                                locus_2,
                                                                identity)
                server.adaptor.execute(sql)
        # Persist the rows of each reference taxon as soon as they are done.
        server.adaptor.commit()
def locus_list2presence_absence_all_genomes(locus_list, biodb_name):
    """Build a tab-separated presence/absence table for a list of loci.

    For every locus tag the orthogroup is looked up, then
    ``heatmap_presence_absence`` supplies one value per genome. Columns
    follow the order of ``manipulate_biosqldb.get_genome_taxons_list``.

    :param locus_list: iterable of locus tags
    :param biodb_name: biodatabase name
    :return: the whole table (header line + one line per locus) as a string
    """
    import re

    server, db = manipulate_biosqldb.load_db(biodb_name)
    locus_tag2seqfeature_id = manipulate_biosqldb.locus_tag2seqfeature_id_dict(
        server, biodb_name)
    taxon_id2description = manipulate_biosqldb.taxon_id2genome_description(
        server, biodb_name)

    # Boilerplate stripped from genome descriptions, applied in this order.
    # Raw strings keep the regex escapes valid on recent Python versions.
    removed_patterns = (
        r" subsp\. aureus",
        r", complete genome\.",
        r", complete sequence\.",
        r"strain ",
        r"str\. ",
        r" complete genome sequence\.",
        r" complete genome\.",
        r" chromosome",
        r"Staphylococcus aureus ",
    )
    for taxon_id in list(taxon_id2description):
        description = taxon_id2description[taxon_id]
        for pattern in removed_patterns:
            description = re.sub(pattern, "", description)
        taxon_id2description[taxon_id] = description

    genomes = manipulate_biosqldb.get_genome_taxons_list(server, biodb_name)
    header = 'orthogroup\t' + ''.join(
        taxon_id2description[taxon_id] + '\t' for taxon_id in genomes)

    lines = [header]
    for locus in locus_list:
        seqfeature_id = locus_tag2seqfeature_id[locus]
        orthogroup = manipulate_biosqldb.seqfeature_id2orthogroup(
            server, seqfeature_id, biodb_name)
        taxon2presence = heatmap_presence_absence(biodb_name, orthogroup)
        row = '%s\t' % orthogroup
        # Distinct loop names: the original reused ``i`` for both the locus
        # and the genome loop.
        for taxon_id in genomes:
            row += '%s\t' % taxon2presence[taxon_id]
        lines.append(row)

    # Every line, header included, is newline-terminated.
    return ''.join(line + '\n' for line in lines)
def biodb2randomized_matrix(bio_db_name):
    """Return a randomized orthology matrix and shortened genome labels.

    The orthology table is loaded into a numpy array, cast to float and
    passed through ``heatmap.randomize_table``. Genome descriptions are
    abbreviated with a fixed, ordered list of regex substitutions.

    :param bio_db_name: biodatabase name
    :return: tuple ``(M, labels)``: the randomized float matrix and the list
        of abbreviated genome descriptions (one per taxon id)
    """
    import re

    server, db = manipulate_biosqldb.load_db(bio_db_name)
    matrix = np.array(
        manipulate_biosqldb.get_orthology_table(server, bio_db_name))

    taxon_id2description = manipulate_biosqldb.taxon_id2genome_description(
        server, bio_db_name)
    taxons_ids = manipulate_biosqldb.get_taxon_id_list(server, bio_db_name)

    # Single-argument print call: identical output on Python 2 and 3 (the
    # original print statement is a SyntaxError on Python 3).
    print('Number of taxons: %s' % len(taxons_ids))

    # (pattern, replacement) pairs, applied in this exact order.
    # NOTE(review): the dots in the ".contig.0_1, ..." and
    # "Rhabdochlamydiaceae sp." patterns are unescaped (match any character),
    # and " DNA" is replaced by "S." -- both kept as in the original; confirm
    # that this is intended.
    substitutions = (
        (r", complete genome\.", ""),
        (r", complete genome", ""),
        (r", complete sequence\.", ""),
        (r"strain ", ""),
        (r"str\. ", ""),
        (r" complete genome sequence\.", ""),
        (r" complete genome\.", ""),
        (r" chromosome", ""),
        (r" DNA", "S."),
        (r"Merged record from ", ""),
        (r", wgs", ""),
        (r"Candidatus ", ""),
        (r".contig.0_1, whole genome shotgun sequence.", ""),
        (r"Protochlamydia", "P."),
        (r"Chlamydia", "C."),
        (r"Chlamydophila", "E."),
        (r"Estrella", "E."),
        (r"Rhodopirellula", "R."),
        (r"Methylacidiphilum", "M."),
        (r" phage", ""),
        (r"Parachlamydia", "P."),
        (r"Neochlamydia", "Neo."),
        (r"Simkania", "S."),
        (r"Waddlia", "W."),
        (r"Pirellula", "P."),
        (r"Rhabdochlamydiaceae sp.", "Rhabdo"),
    )

    # Descriptions are keyed by the string form of the taxon id.
    labels = [taxon_id2description[str(taxon_id)] for taxon_id in taxons_ids]
    for index, description in enumerate(labels):
        for pattern, replacement in substitutions:
            description = re.sub(pattern, replacement, description)
        labels[index] = description

    M = matrix.astype(float)
    M = heatmap.randomize_table(M)
    return (M, labels)
def convert_tree_taxon2genome(biodb_name, input_tree, output_tree, sqlite=False):
    """Rename the terminal nodes of a tree from taxon ids to genome names.

    Genome descriptions are shortened with a fixed list of regex
    substitutions, then ``parse_newick_tree.convert_terminal_node_names``
    applies the mapping and the result is written in newick format.

    :param biodb_name: biodatabase name
    :param input_tree: tree handed to ``convert_terminal_node_names``
    :param output_tree: destination handed to ``Phylo.write``
    :param sqlite: forwarded to ``manipulate_biosqldb.load_db``
    :return: None
    """
    import re

    server, db = manipulate_biosqldb.load_db(biodb_name, sqlite=sqlite)

    # Progress output uses single-argument print calls, which print the same
    # text on Python 2 and 3 (the original print statements are Python 2 only).
    print(biodb_name)

    taxon_id2genome_description = manipulate_biosqldb.taxon_id2genome_description(
        server, biodb_name)
    print(taxon_id2genome_description)

    # (pattern, replacement) pairs, applied in this exact order.
    # NOTE(review): " DNA" is replaced by "S." as in the original -- confirm
    # that this is intended.
    substitutions = (
        (r" subsp\. aureus", ""),
        (r", complete genome\.", ""),
        (r", complete sequence\.", ""),
        (r"strain ", ""),
        (r"str\. ", ""),
        (r" complete genome sequence\.", ""),
        (r" complete genome\.", ""),
        (r" chromosome", ""),
        (r"Staphylococcus", "S."),
        (r" DNA", "S."),
    )
    for taxon_id in list(taxon_id2genome_description):
        print(taxon_id)
        description = taxon_id2genome_description[taxon_id]
        for pattern, replacement in substitutions:
            description = re.sub(pattern, replacement, description)
        taxon_id2genome_description[taxon_id] = description

    print(taxon_id2genome_description)

    new_tree = parse_newick_tree.convert_terminal_node_names(
        input_tree, taxon_id2genome_description)

    print("writing converted tree...")
    print(output_tree)
    Phylo.write(new_tree, output_tree, 'newick')
def write_ortho_matrix(bio_db_name):
    """Write the orthology table to ``ortho_matrix.tab`` (tab-separated).

    The header line is ``orthogroup`` followed by the shortened genome
    description of every taxon id; the rows of the orthology table follow.

    :param bio_db_name: biodatabase name
    :return: None (the file is written to the current working directory)
    """
    import re

    server, db = manipulate_biosqldb.load_db(bio_db_name)
    matrix = np.array(
        manipulate_biosqldb.get_orthology_table(server, bio_db_name))

    taxon_id2description = manipulate_biosqldb.taxon_id2genome_description(
        server, bio_db_name)
    taxons_ids = manipulate_biosqldb.get_taxon_id_list(server, bio_db_name)

    # Boilerplate stripped from genome descriptions, applied in this order.
    # Raw strings keep the regex escapes valid on recent Python versions.
    removed_patterns = (
        r" subsp\. aureus",
        r", complete genome\.",
        r", complete sequence\.",
        r"strain ",
        r"str\. ",
        r" complete genome sequence\.",
        r" complete genome\.",
        r" chromosome",
        r" DNA",
    )
    for taxon_id in list(taxon_id2description):
        description = taxon_id2description[taxon_id]
        for pattern in removed_patterns:
            description = re.sub(pattern, "", description)
        taxon_id2description[taxon_id] = description

    # Descriptions are keyed by the string form of the taxon id.
    column_names = [taxon_id2description[str(taxon_id)] for taxon_id in taxons_ids]

    # ``with`` closes the file even if a write raises.
    with open("ortho_matrix.tab", "w") as out:
        out.write("orthogroup\t" + "\t".join(column_names) + "\n")
        for row in matrix:
            out.write("\t".join(row) + "\n")
def shared_orthogroups_average_identity(db_name):
    """Create and fill ``comparative_tables.shared_og_av_id_<db_name>``.

    For every unordered pair of taxa, the identities stored in
    ``comparative_tables.identity_closest_homolog2_<db_name>`` are summarised
    as average, median and number of pairs. When a pair has no identity rows,
    average and median are stored as SQL NULL and ``n_pairs`` is 0.

    :param db_name: biodatabase name, also used as the table-name suffix
    :return: None
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    import numpy

    server, db = manipulate_biosqldb.load_db(db_name)

    sql = "CREATE TABLE comparative_tables.shared_og_av_id_%s(taxon_1 INT NOT NULL," \
          " taxon_2 INT NOT NULL," \
          " average_identity FLOAT," \
          " median_identity FLOAT," \
          " n_pairs INT)" % (db_name)
    server.adaptor.execute(sql)

    taxon2description = manipulate_biosqldb.taxon_id2genome_description(
        server, biodatabase_name=db_name)
    all_taxons = list(taxon2description.keys())

    for index, taxon_1 in enumerate(all_taxons):
        # Only pairs (taxon_1, taxon_2) with taxon_2 later in the list.
        for taxon_2 in all_taxons[index + 1:]:
            data_sql = 'select identity from comparative_tables.identity_closest_homolog2_%s where taxon_1=%s and taxon_2=%s' % (
                db_name, taxon_1, taxon_2)
            data = [row[0] for row in server.adaptor.execute_and_fetchall(data_sql, )]
            print(data)

            if data:
                average_identity = numpy.average(data)
                median_identity = numpy.median(data)
            else:
                # numpy returns nan for an empty list; "nan" is not a valid
                # SQL literal, so store NULL instead.
                average_identity = median_identity = 'NULL'

            sql = 'insert into comparative_tables.shared_og_av_id_%s(taxon_1, taxon_2, average_identity,' \
                  ' median_identity, n_pairs) values (%s, %s, %s, %s, %s)' % (db_name,
                                                                            taxon_1,
                                                                            taxon_2,
                                                                            average_identity,
                                                                            median_identity,
                                                                            len(data))
            print(sql)
            # An INSERT returns no rows: plain execute, then commit.
            server.adaptor.execute(sql)
            server.adaptor.commit()
def plot_tree_stacked_barplot(
        tree_file,
        taxon2value_list_barplot=False,
        header_list=False,  # headers of the stacked barplot columns
        taxon2set2value_heatmap=False,
        taxon2label=False,
        header_list2=False,  # headers of the count (heatmap) columns
        biodb=False,
        column_scale=True,
        general_max=False,
        header_list3=False,
        set2taxon2value_list_simple_barplot=False,
        set2taxon2value_list_simple_barplot_counts=True,
        rotate=False,
        taxon2description=False):
    '''
    Decorate a midpoint-rooted tree with aligned label, stacked-barplot,
    simple-barplot and heatmap columns.

    taxon2value_list_barplot maps a leaf name to a list of lists:
    [[bar1_part1, bar1_part2, ...], [bar2_part1, bar2_part2]].
    The values of each inner list are converted to percentages of its sum.

    :param tree_file: passed to ``Tree(...)``
    :param taxon2value_list_barplot: leaf name (or ``int(leaf name)``) ->
        list of value lists, one stacked bar per inner list
    :param header_list: column headers of the stacked bars
    :param taxon2set2value_heatmap: column name -> {leaf name -> value},
        indexed with the names in ``header_list2``
    :param taxon2label: leaf name (or ``int(leaf name)``) -> label shown in
        aligned column 1; '-' is shown when the lookup fails
    :param header_list2: column headers of the heatmap columns
    :param biodb: when truthy, ``taxon2description`` is loaded from this
        biodatabase and replaces the argument of the same name
    :param column_scale: with ``header_list2``, build one colour scale per
        heatmap column
    :param general_max: not used in this function
    :param header_list3: set names / headers of the simple barplots
    :param set2taxon2value_list_simple_barplot: set name -> {leaf name -> value}
    :param set2taxon2value_list_simple_barplot_counts: also show the value
        as text next to each simple bar
    :param rotate: rotate the per-leaf faces by 270 degrees
    :param taxon2description: leaf name -> text of the leaf label
    :return: tuple ``(t1, tss)``: the decorated tree and its TreeStyle
    '''

    if biodb:
        from chlamdb.biosqldb import manipulate_biosqldb
        server, db = manipulate_biosqldb.load_db(biodb)

        taxon2description = manipulate_biosqldb.taxon_id2genome_description(
            server, biodb, filter_names=True)

    t1 = Tree(tree_file)

    # Calculate the midpoint node
    R = t1.get_midpoint_outgroup()
    # and set it as tree outgroup
    t1.set_outgroup(R)

    # Two palettes for the stacked bars: every third bar column (col % 3 == 0)
    # uses colors2, the others use colors.
    colors2 = [
        "red", "#FFFF00", "#58FA58", "#819FF7", "#F781F3", "#2E2E2E",
        "#F7F8E0", 'black'
    ]
    colors = [
        "#7fc97f", "#386cb0", "#fdc086", "#ffffb3", "#fdb462", "#f0027f",
        "#F7F8E0", 'black'
    ]

    tss = TreeStyle()
    tss.draw_guiding_lines = True
    tss.guiding_lines_color = "gray"
    tss.show_leaf_name = False

    # NOTE(review): rgb2hex and column2scale only exist when this condition
    # holds, but the heatmap section below uses them whenever
    # taxon2set2value_heatmap is truthy.
    if column_scale and header_list2:
        import matplotlib.cm as cm
        from matplotlib.colors import rgb2hex
        import matplotlib as mpl
        column2scale = {}
        col_n = 0
        for column in header_list2:
            values = taxon2set2value_heatmap[column].values()
            # A constant column gets an artificial 0 .. 1.5*max range.
            if min(values) == max(values):
                min_val = 0
                max_val = 1.5 * max(values)
            else:
                min_val = min(values)
                max_val = max(values)
            norm = mpl.colors.Normalize(vmin=min_val, vmax=max_val)
            # First four heatmap columns use OrRd, the rest YlGnBu.
            if col_n < 4:
                cmap = cm.OrRd
            else:
                cmap = cm.YlGnBu
            m = cm.ScalarMappable(norm=norm, cmap=cmap)
            # Store the colour mapper together with the column maximum.
            column2scale[column] = [m, float(max_val)]
            col_n += 1

    for i, lf in enumerate(t1.iter_leaves()):

        # Column headers are added once, while visiting the first leaf.
        if i == 0:

            if taxon2label:
                # Blank header above the label column.
                n = TextFace(' ')
                n.margin_top = 1
                n.margin_right = 1
                n.margin_left = 20
                n.margin_bottom = 1
                n.hz_align = 2
                n.vt_align = 2
                n.rotation = 270
                n.inner_background.color = "white"
                n.opacity = 1.
                tss.aligned_header.add_face(n, 0)
                col_add = 1
            else:
                col_add = 1

            # Headers of the stacked barplot columns.
            if header_list:
                for col, header in enumerate(header_list):

                    n = TextFace('%s' % (header))
                    n.margin_top = 0
                    n.margin_right = 1
                    n.margin_left = 20
                    n.margin_bottom = 1
                    n.rotation = 270
                    n.hz_align = 2
                    n.vt_align = 2
                    n.inner_background.color = "white"
                    n.opacity = 1.
                    tss.aligned_header.add_face(n, col + col_add)
                # Shift the offset past the columns just used.
                col_add += col + 1

            # Headers of the simple barplot columns.
            if header_list3:
                col_tmp = 0
                for header in header_list3:
                    n = TextFace('%s' % (header))
                    n.margin_top = 1
                    n.margin_right = 1
                    n.margin_left = 20
                    n.margin_bottom = 1
                    n.rotation = 270
                    n.hz_align = 2
                    n.vt_align = 2
                    n.inner_background.color = "white"
                    n.opacity = 1.

                    if set2taxon2value_list_simple_barplot_counts:
                        # Two columns per set (count text + bar): the header
                        # goes above the second one, a blank face above the
                        # first. Mirrors the per-leaf layout below.
                        if col_tmp == 0:
                            col_tmp += 1
                        tss.aligned_header.add_face(n, col_tmp + 1 + col_add)
                        n = TextFace(' ')
                        tss.aligned_header.add_face(n, col_tmp + col_add)
                        col_tmp += 2
                    else:
                        tss.aligned_header.add_face(n, col_tmp + col_add)
                        col_tmp += 1
                # Both branches apply the same offset.
                if set2taxon2value_list_simple_barplot_counts:
                    col_add += col_tmp
                else:
                    col_add += col_tmp

            # Headers of the heatmap columns.
            if header_list2:
                for col, header in enumerate(header_list2):
                    n = TextFace('%s' % (header))
                    n.margin_top = 1
                    n.margin_right = 1
                    n.margin_left = 20
                    n.margin_bottom = 1
                    n.rotation = 270
                    n.hz_align = 2
                    n.vt_align = 2
                    n.inner_background.color = "white"
                    n.opacity = 1.
                    tss.aligned_header.add_face(n, col + col_add)
                col_add += col + 1

        # Per-leaf label column: try the leaf name, then its integer form,
        # and fall back to '-'.
        if taxon2label:
            try:
                n = TextFace('%s' % taxon2label[lf.name])
            except:
                try:
                    n = TextFace('%s' % taxon2label[int(lf.name)])
                except:
                    n = TextFace('-')
            n.margin_top = 1
            n.margin_right = 1
            n.margin_left = 20
            n.margin_bottom = 1
            n.inner_background.color = "white"
            n.opacity = 1.
            if rotate:
                n.rotation = 270
            lf.add_face(n, 1, position="aligned")
            col_add = 2
        else:
            # The column offset is reset to 2 for every leaf in both branches.
            col_add = 2

        # Stacked barplots: each inner list becomes one bar of percentages.
        if taxon2value_list_barplot:

            try:
                val_list_of_lists = taxon2value_list_barplot[lf.name]
            except:
                val_list_of_lists = taxon2value_list_barplot[int(lf.name)]

            for col, value_list in enumerate(val_list_of_lists):

                total = float(sum(value_list))
                percentages = [(i / total) * 100 for i in value_list]
                if col % 3 == 0:
                    col_list = colors2
                else:
                    col_list = colors
                b = StackedBarFace(percentages,
                                   width=150,
                                   height=18,
                                   colors=col_list[0:len(percentages)])
                b.rotation = 0
                b.inner_border.color = "white"
                b.inner_border.width = 0
                b.margin_right = 5
                b.margin_left = 5
                if rotate:
                    b.rotation = 270
                lf.add_face(b, col + col_add, position="aligned")

            col_add += col + 1

        # Simple barplots: one bar per set, scaled to the largest value of
        # that set over all taxa.
        if set2taxon2value_list_simple_barplot:
            col_list = [
                '#fc8d59', '#91bfdb', '#99d594', '#c51b7d', '#f1a340',
                '#999999'
            ]
            color_i = 0
            col = 0
            for one_set in header_list3:
                # Cycle through the six bar colours.
                if color_i > 5:
                    color_i = 0
                color = col_list[color_i]
                color_i += 1
                # values for all taxons
                values_lists = [
                    float(i) for i in
                    set2taxon2value_list_simple_barplot[one_set].values()
                ]
                value = set2taxon2value_list_simple_barplot[one_set][lf.name]

                if set2taxon2value_list_simple_barplot_counts:
                    # Value as text (floats rounded to 2 decimals) in the
                    # column before the bar.
                    if isinstance(value, float):
                        a = TextFace(" %s " % str(round(value, 2)))
                    else:
                        a = TextFace(" %s " % str(value))
                    a.margin_top = 1
                    a.margin_right = 2
                    a.margin_left = 5
                    a.margin_bottom = 1
                    if rotate:
                        a.rotation = 270
                    lf.add_face(a, col + col_add, position="aligned")

                # Bar length as a percentage of the set maximum.
                fraction_biggest = (float(value) / max(values_lists)) * 100
                fraction_rest = 100 - fraction_biggest

                b = StackedBarFace([fraction_biggest, fraction_rest],
                                   width=100,
                                   height=15,
                                   colors=[color, 'white'])
                b.rotation = 0
                b.inner_border.color = "grey"
                b.inner_border.width = 0
                b.margin_right = 15
                b.margin_left = 0
                if rotate:
                    b.rotation = 270
                if set2taxon2value_list_simple_barplot_counts:
                    # Same column arithmetic as the header section above.
                    if col == 0:
                        col += 1
                    lf.add_face(b, col + 1 + col_add, position="aligned")
                    col += 2
                else:
                    lf.add_face(b, col + col_add, position="aligned")
                    col += 1
            # Both branches apply the same offset.
            if set2taxon2value_list_simple_barplot_counts:
                col_add += col
            else:
                col_add += col

        # Heatmap columns: coloured count cells.
        if taxon2set2value_heatmap:
            # ``i`` is reused here as a column counter; the enumerate() of
            # the leaf loop rebinds it on the next iteration.
            i = 0
            for col2, head in enumerate(header_list2):

                col_name = header_list2[i]
                # Look the value up by leaf name, then by the leaf name as a
                # rounded float; default to 0.
                try:
                    value = taxon2set2value_heatmap[col_name][str(lf.name)]
                except:
                    try:
                        value = taxon2set2value_heatmap[col_name][round(
                            float(lf.name), 2)]
                    except:
                        value = 0
                if header_list2[i] == 'duplicates':
                    print('dupli', lf.name, value)
                if int(value) > 0:
                    # Field width depends on the number of digits.
                    if int(value) >= 10 and int(value) < 100:
                        n = TextFace('%4i' % value)
                    elif int(value) >= 100:
                        n = TextFace('%3i' % value)
                    else:
                        n = TextFace('%5i' % value)

                    n.margin_top = 1
                    n.margin_right = 2
                    n.margin_left = 5
                    n.margin_bottom = 1
                    n.hz_align = 1
                    n.vt_align = 1
                    if rotate:
                        n.rotation = 270
                    # Background colour from the per-column colour scale.
                    n.inner_background.color = rgb2hex(
                        column2scale[col_name][0].to_rgba(float(value)))
                    # White text when the value exceeds the stored column
                    # maximum.
                    if float(value) > column2scale[col_name][1]:
                        n.fgcolor = 'white'
                    n.opacity = 1.
                    n.hz_align = 1
                    n.vt_align = 1
                    lf.add_face(n, col2 + col_add, position="aligned")
                    i += 1
                else:
                    # Values <= 0 are drawn as an empty white cell.
                    n = TextFace('')
                    n.margin_top = 1
                    n.margin_right = 1
                    n.margin_left = 5
                    n.margin_bottom = 1
                    n.inner_background.color = "white"
                    n.opacity = 1.
                    if rotate:
                        n.rotation = 270
                    lf.add_face(n, col2 + col_add, position="aligned")
                    i += 1

        # Leaf label: the description is added as a face; lf.name itself is
        # left unchanged.
        n = TextFace(taxon2description[lf.name],
                     fgcolor="black",
                     fsize=12,
                     fstyle='italic')
        lf.add_face(n, 0)

    # Node markers: size-6 black marker when support < 1, otherwise a
    # size-0 red marker.
    for n in t1.traverse():
        nstyle = NodeStyle()

        if n.support < 1:
            nstyle["fgcolor"] = "black"
            nstyle["size"] = 6
            n.set_style(nstyle)
        else:
            nstyle["fgcolor"] = "red"
            nstyle["size"] = 0
            n.set_style(nstyle)

    return t1, tss
def plot_heat_tree(tree_file,
                   biodb="chlamydia_04_16",
                   exclude_outgroup=False,
                   bw_scale=True):
    """Decorate a tree with genome size, GC, non-coding and checkM columns.

    Aligned columns per leaf: 2/3 genome size (text + bar), 4/5 GC content,
    6/7 non-coding fraction and, when the ``custom_tables.checkm_<biodb>``
    table can be read, 8/9 completeness and 10/11 contamination.

    :param tree_file: a path/newick string (loaded with ``Tree`` and midpoint
        rooted when possible) or an existing ``Tree`` instance (used as is)
    :param biodb: biodatabase name
    :param exclude_outgroup: when true, the first leaf is left out of the
        colour scales, gets no data columns and is renamed to its description
    :param bw_scale: when true, bars use one fixed colour per column instead
        of a colour scale
    :return: tuple ``(t1, tss)``: the decorated tree and its TreeStyle
    :raises IOError: if ``tree_file`` is neither a string nor a ``Tree``
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    import matplotlib.cm as cm
    from matplotlib.colors import rgb2hex
    import matplotlib as mpl

    server, db = manipulate_biosqldb.load_db(biodb)

    sql_biodatabase_id = 'select biodatabase_id from biodatabase where name="%s"' % biodb
    db_id = server.adaptor.execute_and_fetchall(sql_biodatabase_id, )[0][0]

    if isinstance(tree_file, str):
        t1 = Tree(tree_file)
        # Midpoint rooting is best effort: keep the tree as loaded when it
        # fails.
        try:
            R = t1.get_midpoint_outgroup()
            t1.set_outgroup(R)
        except Exception:
            pass
    elif isinstance(tree_file, Tree):
        t1 = tree_file
    else:
        # The original built this exception without raising it, which led to
        # a NameError on ``t1`` further down.
        raise IOError('Unknown tree format')

    tss = TreeStyle()
    tss.draw_guiding_lines = True
    tss.guiding_lines_color = "gray"
    tss.show_leaf_name = False

    sql2 = 'select t2.taxon_id, t1.GC from genomes_info_%s as t1 inner join bioentry as t2 ' \
           ' on t1.accession=t2.accession where t2.biodatabase_id=%s and t1.description not like "%%%%plasmid%%%%";' % (biodb, db_id)
    sql3 = 'select t2.taxon_id, t1.genome_size from genomes_info_%s as t1 ' \
           ' inner join bioentry as t2 on t1.accession=t2.accession ' \
           ' where t2.biodatabase_id=%s and t1.description not like "%%%%plasmid%%%%";' % (biodb, db_id)
    sql4 = 'select t2.taxon_id,percent_non_coding from genomes_info_%s as t1 ' \
           ' inner join bioentry as t2 on t1.accession=t2.accession ' \
           ' where t2.biodatabase_id=%s and t1.description not like "%%%%plasmid%%%%";' % (biodb, db_id)

    sql_checkm_completeness = 'select taxon_id, completeness from custom_tables.checkm_%s;' % biodb
    sql_checkm_contamination = 'select taxon_id,contamination from custom_tables.checkm_%s;' % biodb

    # checkM data are optional: if the queries fail, the completeness and
    # contamination columns are left out.
    try:
        taxon_id2completeness = manipulate_biosqldb.to_dict(
            server.adaptor.execute_and_fetchall(sql_checkm_completeness))
        taxon_id2contamination = manipulate_biosqldb.to_dict(
            server.adaptor.execute_and_fetchall(sql_checkm_contamination))
    except Exception:
        taxon_id2completeness = False
        taxon_id2contamination = False

    taxon2description = manipulate_biosqldb.taxon_id2genome_description(
        server, biodb, filter_names=True)
    taxon2gc = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql2, ))
    taxon2genome_size = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql3, ))
    taxon2coding_density = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql4, ))

    my_taxons = [lf.name for lf in t1.iter_leaves()]

    if exclude_outgroup:
        # The first leaf is treated as the outgroup and left out of the
        # scales.
        excluded = str(list(t1.iter_leaves())[0].name)
        my_taxons.pop(my_taxons.index(excluded))

    genome_sizes = [float(taxon2genome_size[i]) for i in my_taxons]
    gc_list = [float(taxon2gc[i]) for i in my_taxons]
    fraction_list = [float(taxon2coding_density[i]) for i in my_taxons]

    max_genome_size = max(genome_sizes)
    max_gc = max(gc_list)
    # Loop invariant: the maximum is taken over all taxa of the database,
    # not only over the leaves of the tree.
    max_non_coding = max(taxon2coding_density.values())

    cmap = cm.YlGnBu

    norm = mpl.colors.Normalize(vmin=min(genome_sizes) - 100000,
                                vmax=max(genome_sizes))
    m1 = cm.ScalarMappable(norm=norm, cmap=cmap)
    norm = mpl.colors.Normalize(vmin=min(gc_list), vmax=max(gc_list))
    m2 = cm.ScalarMappable(norm=norm, cmap=cmap)
    norm = mpl.colors.Normalize(vmin=min(fraction_list),
                                vmax=max(fraction_list))
    m3 = cm.ScalarMappable(norm=norm, cmap=cmap)

    def _header_face(label):
        # Tilted column title on a white background.
        face = TextFace(label)
        face.rotation = -25
        face.margin_top = 1
        face.margin_right = 1
        face.margin_left = 20
        face.margin_bottom = 1
        face.inner_background.color = "white"
        face.opacity = 1.
        return face

    def _number_face(text, margin_right):
        # Small numeric cell shown next to a bar.
        face = TextFace(text)
        face.margin_top = 1
        face.margin_right = margin_right
        face.margin_left = 0
        face.margin_bottom = 1
        face.fsize = 7
        face.inner_background.color = "white"
        face.opacity = 1.
        return face

    def _bar_face(fractions, bar_colors, margin_left, margin_right):
        # Horizontal two-part bar, 100 px wide.
        face = StackedBarFace(fractions,
                              width=100,
                              height=9,
                              colors=bar_colors)
        face.rotation = 0
        face.inner_border.color = "black"
        face.inner_border.width = 0
        face.margin_right = margin_right
        face.margin_left = margin_left
        return face

    def _bar_color(fixed_color, mappable, raw_value, highlighted):
        # Fixed colour in bw_scale mode and for the highlighted genome,
        # otherwise the colour given by the column scale.
        if highlighted or bw_scale:
            return fixed_color
        return rgb2hex(mappable.to_rgba(float(raw_value)))

    for i, lf in enumerate(t1.iter_leaves()):

        # Column headers are added once, while visiting the first leaf.
        if i == 0:
            tss.aligned_header.add_face(_header_face('Size (Mbp)'), 3)
            tss.aligned_header.add_face(_header_face('GC (%)'), 5)

            # The same empty face is placed above both text columns.
            n = TextFace('')
            tss.aligned_header.add_face(n, 2)
            tss.aligned_header.add_face(n, 4)

            tss.aligned_header.add_face(_header_face('Non coding (%)'), 7)
            tss.aligned_header.add_face(TextFace(''), 6)

            if taxon_id2completeness:
                tss.aligned_header.add_face(_header_face('Completeness (%)'), 9)
                tss.aligned_header.add_face(TextFace(''), 8)
                tss.aligned_header.add_face(_header_face('Contamination (%)'), 11)
                tss.aligned_header.add_face(TextFace(''), 10)

        if exclude_outgroup and i == 0:
            # The outgroup only gets its description as name, no data
            # columns.
            lf.name = taxon2description[lf.name]
            continue

        highlighted = taxon2description[lf.name] == 'Rhabdochlamydia helveticae T3358'

        # Genome size: text (Mbp) in column 2, bar in column 3.
        n = _number_face(
            ' %s ' % str(round(taxon2genome_size[lf.name] / float(1000000), 2)),
            1)
        lf.add_face(n, 2, position="aligned")

        fraction_biggest = (float(taxon2genome_size[lf.name]) / max_genome_size) * 100
        fraction_rest = 100 - fraction_biggest
        col = _bar_color('#fc8d59', m1, taxon2genome_size[lf.name], highlighted)
        b = _bar_face([fraction_biggest, fraction_rest], [col, 'white'], 0, 15)
        lf.add_face(b, 3, position="aligned")

        # GC content: text in column 4, bar in column 5.
        fraction_biggest = (float(taxon2gc[lf.name]) / max_gc) * 100
        fraction_rest = 100 - fraction_biggest
        col = _bar_color('#91bfdb', m2, taxon2gc[lf.name], highlighted)
        b = _bar_face([fraction_biggest, fraction_rest], [col, 'white'], 0, 15)
        lf.add_face(b, 5, position="aligned")

        n = _number_face(' %s ' % str(round(float(taxon2gc[lf.name]), 2)), 0)
        lf.add_face(n, 4, position="aligned")

        # Non-coding fraction: text in column 6, bar in column 7.
        col = _bar_color('#99d594', m3, taxon2coding_density[lf.name], highlighted)
        n = _number_face(' %s ' % str(float(taxon2coding_density[lf.name])), 0)
        lf.add_face(n, 6, position="aligned")

        fraction = (float(taxon2coding_density[lf.name]) / max_non_coding) * 100
        fraction_rest = ((max_non_coding - taxon2coding_density[lf.name]) /
                         float(max_non_coding)) * 100
        b = _bar_face([fraction, fraction_rest], [col, 'white'], 5, 1)
        lf.add_face(b, 7, position="aligned")

        if taxon_id2completeness:
            # Completeness: text in column 8, bar in column 9.
            n = _number_face(' %s ' % str(float(taxon_id2completeness[lf.name])), 0)
            lf.add_face(n, 8, position="aligned")
            fraction = float(taxon_id2completeness[lf.name])
            fraction_rest = 100 - fraction
            b = _bar_face([fraction, fraction_rest], ["#d7191c", 'white'], 5, 1)
            lf.add_face(b, 9, position="aligned")

            # Contamination: text in column 10, bar in column 11.
            n = _number_face(' %s ' % str(float(taxon_id2contamination[lf.name])), 0)
            lf.add_face(n, 10, position="aligned")
            fraction = float(taxon_id2contamination[lf.name])
            fraction_rest = 100 - fraction
            b = _bar_face([fraction, fraction_rest], ["black", 'white'], 5, 1)
            lf.add_face(b, 11, position="aligned")

        # Leaf label: the description is added as a face; lf.name itself is
        # left unchanged.
        n = TextFace(taxon2description[lf.name],
                     fgcolor="black",
                     fsize=9,
                     fstyle='italic')
        n.margin_right = 30
        lf.add_face(n, 0)

    # Node markers: size-6 black marker when support < 1, otherwise a
    # size-0 red marker.
    for n in t1.traverse():
        nstyle = NodeStyle()
        if n.support < 1:
            nstyle["fgcolor"] = "black"
            nstyle["size"] = 6
        else:
            nstyle["fgcolor"] = "red"
            nstyle["size"] = 0
        n.set_style(nstyle)

    return t1, tss
def plot_tree_text_metadata(tree_file, header2taxon2text, ordered_header_list,
                            biodb):
    """Decorate a midpoint-rooted tree with one text column per header.

    :param tree_file: passed to ``Tree(...)``
    :param header2taxon2text: header -> {int(leaf name) -> text}
    :param ordered_header_list: headers, in display order
    :param biodb: biodatabase name used to fetch the genome descriptions
    :return: tuple ``(t1, tss)``: the decorated tree and its TreeStyle
    """
    from chlamdb.biosqldb import manipulate_biosqldb

    def _header_cell(title):
        # Vertical column title on a white background.
        cell = TextFace('%s' % title)
        cell.margin_top = 0
        cell.margin_right = 1
        cell.margin_left = 20
        cell.margin_bottom = 1
        cell.rotation = 270
        cell.hz_align = 2
        cell.vt_align = 2
        cell.inner_background.color = "white"
        cell.opacity = 1.
        return cell

    def _text_cell(content):
        # Plain metadata cell on a white background.
        cell = TextFace('%s' % content)
        cell.margin_top = 1
        cell.margin_right = 1
        cell.margin_left = 5
        cell.margin_bottom = 1
        cell.inner_background.color = "white"
        cell.opacity = 1.
        return cell

    server, db = manipulate_biosqldb.load_db(biodb)

    t1 = Tree(tree_file)

    taxon2description = manipulate_biosqldb.taxon_id2genome_description(
        server, biodb, filter_names=True)

    # Root the tree on its midpoint node.
    midpoint = t1.get_midpoint_outgroup()
    t1.set_outgroup(midpoint)

    tss = TreeStyle()
    tss.draw_guiding_lines = True
    tss.guiding_lines_color = "gray"
    tss.show_leaf_name = False

    for leaf_index, leaf in enumerate(t1.iter_leaves()):

        # Headers are attached once, while visiting the first leaf.
        if leaf_index == 0:
            for position, title in enumerate(ordered_header_list):
                tss.aligned_header.add_face(_header_cell(title), position)

        # Metadata cells start at aligned column 1.
        for position, title in enumerate(ordered_header_list):
            content = header2taxon2text[title][int(leaf.name)]
            leaf.add_face(_text_cell(content), position + 1,
                          position="aligned")

        # Show the genome description instead of the taxon id.
        label = TextFace(taxon2description[leaf.name],
                         fgcolor="black",
                         fsize=12,
                         fstyle='italic')
        leaf.add_face(label, 0)

    # Node markers: size-6 black marker when support < 1, otherwise a
    # size-0 red marker.
    for node in t1.traverse():
        marker = NodeStyle()
        weak = node.support < 1
        marker["fgcolor"] = "black" if weak else "red"
        marker["size"] = 6 if weak else 0
        node.set_style(marker)

    return t1, tss
def _taxon_keyed_value(mapping, taxon_name, default):
    '''Return ``mapping[taxon_name]``, retrying with ``int(taxon_name)``.

    ``default`` is returned when neither form of the key is present.
    '''
    try:
        return mapping[taxon_name]
    except KeyError:
        pass
    try:
        return mapping[int(taxon_name)]
    except (KeyError, ValueError, TypeError):
        return default


def plot_tree_barplot(tree_file,
                      taxon2value_list_barplot,
                      header_list,
                      taxon2set2value_heatmap=False,
                      header_list2=False,
                      presence_only=True,
                      biodb="chlamydia_04_16",
                      column_scale=True,
                      general_max=False,
                      barplot2percentage=False):
    '''
    Display one or more barplot columns (and optional heatmap columns)
    next to a midpoint-rooted tree.

    :param tree_file: a ``Tree`` instance, or anything accepted by ``Tree(...)``
    :param taxon2value_list_barplot: mapping leaf name (str or int) -> list of
        values, one per entry of ``header_list``
    :param header_list: barplot column titles
    :param taxon2set2value_heatmap: optional nested mapping
        column name -> leaf name (str or int) -> value
    :param header_list2: heatmap column titles (keys of
        ``taxon2set2value_heatmap``)
    :param presence_only: unused by this function
    :param biodb: database name passed to ``manipulate_biosqldb.load_db``
    :param column_scale: build one colour scale per heatmap column; when
        False, coloured heatmap cells use a flat "orange" background
    :param general_max: if set, used as the 100% reference of every bar
        instead of the per-column maximum
    :param barplot2percentage: list of bool to indicate if the numbers are
        percentages and the range should be set to 0-100
    :return: tuple ``(tree, tree_style)``
    '''
    from chlamdb.biosqldb import manipulate_biosqldb
    import matplotlib.cm as cm
    from matplotlib.colors import rgb2hex
    import matplotlib as mpl

    server, db = manipulate_biosqldb.load_db(biodb)
    taxon2description = manipulate_biosqldb.taxon_id2genome_description(
        server, biodb, filter_names=True)

    if isinstance(tree_file, Tree):
        t1 = tree_file
    else:
        t1 = Tree(tree_file)

    # Calculate the midpoint node
    R = t1.get_midpoint_outgroup()
    # and set it as tree outgroup
    t1.set_outgroup(R)

    tss = TreeStyle()
    tss.draw_guiding_lines = True
    tss.guiding_lines_color = "gray"
    tss.show_leaf_name = False

    # One colour scale per heatmap column. Always defined, so the heatmap
    # section below cannot hit a NameError when column_scale is False.
    column2scale = {}
    if column_scale and header_list2:
        for column in header_list2:
            values = taxon2set2value_heatmap[column].values()
            norm = mpl.colors.Normalize(vmin=min(values), vmax=max(values))
            column2scale[column] = cm.ScalarMappable(norm=norm, cmap=cm.OrRd)

    # One colour scale and one 100% reference per barplot column.
    cmap = cm.YlGnBu
    values_lists = taxon2value_list_barplot.values()
    scale_list = []
    max_value_list = []
    for n, header in enumerate(header_list):
        data = [float(i[n]) for i in values_lists]
        if barplot2percentage is not False and barplot2percentage[n] is True:
            max_value = 100
            min_value = 0
        else:
            max_value = max(data)
            min_value = min(data)
        norm = mpl.colors.Normalize(vmin=min_value, vmax=max_value)
        scale_list.append(cm.ScalarMappable(norm=norm, cmap=cmap))
        if not general_max:
            max_value_list.append(float(max_value))
        else:
            max_value_list.append(general_max)

    for i, lf in enumerate(t1.iter_leaves()):
        if i == 0:
            # Headers: every barplot uses two columns (value text + bar).
            col_add = 0
            for col, header in enumerate(header_list):
                n = TextFace(' ')
                n.margin_top = 1
                n.margin_right = 2
                n.margin_left = 2
                n.margin_bottom = 1
                n.rotation = 90
                n.inner_background.color = "white"
                n.opacity = 1.
                n.hz_align = 2
                n.vt_align = 2
                tss.aligned_header.add_face(n, col_add)

                n = TextFace('%s' % header)
                n.margin_top = 1
                n.margin_right = 2
                n.margin_left = 2
                n.margin_bottom = 80
                n.rotation = 270
                n.inner_background.color = "white"
                n.opacity = 1.
                n.hz_align = 2
                n.vt_align = 2
                tss.aligned_header.add_face(n, col_add + 1)
                col_add += 2

            if header_list2:
                for col, header in enumerate(header_list2):
                    n = TextFace('%s' % header)
                    n.margin_top = 1
                    n.margin_right = 20
                    n.margin_left = 2
                    n.margin_bottom = 1
                    n.rotation = 270
                    n.hz_align = 2
                    n.vt_align = 2
                    n.inner_background.color = "white"
                    n.opacity = 1.
                    tss.aligned_header.add_face(n, col + col_add)

        # Leaf names may be stored as str or int keys; unknown leaves get [0].
        val_list = _taxon_keyed_value(taxon2value_list_barplot, lf.name, [0])

        col_add = 0
        for col, value in enumerate(val_list):
            # show value itself
            n = TextFace(' %s ' % str(value))
            n.margin_top = 1
            n.margin_right = 10
            n.margin_left = 2
            n.margin_bottom = 1
            n.inner_background.color = "white"
            n.opacity = 1.
            lf.add_face(n, col_add, position="aligned")

            # show bar
            numeric = float(value)
            color = rgb2hex(scale_list[col].to_rgba(numeric))
            column_max = max_value_list[col]
            if column_max:
                percentage = (numeric / column_max) * 100
                maximum_bar = ((column_max - numeric) / column_max) * 100
            else:
                # A zero reference would divide by zero: draw an empty bar.
                percentage = 0
                maximum_bar = 100
            b = StackedBarFace([percentage, maximum_bar],
                               width=100,
                               height=10,
                               colors=[color, "white"])
            b.rotation = 0
            b.inner_border.color = "grey"
            b.inner_border.width = 0
            b.margin_right = 15
            b.margin_left = 0
            lf.add_face(b, col_add + 1, position="aligned")
            col_add += 2

        # Heatmap columns need both the data and the list of column names.
        if taxon2set2value_heatmap and header_list2:
            for col, col_name in enumerate(header_list2):
                try:
                    column_values = taxon2set2value_heatmap[col_name]
                except KeyError:
                    value = 0
                else:
                    value = _taxon_keyed_value(column_values, lf.name, 0)

                if int(value) > 0:
                    n = TextFace(' %i ' % int(value))
                    n.margin_top = 1
                    n.margin_right = 1
                    n.margin_left = 20
                    n.margin_bottom = 1
                    n.fgcolor = "white"
                    scale = column2scale.get(col_name)
                    if scale is not None:
                        n.inner_background.color = rgb2hex(
                            scale.to_rgba(float(value)))
                    else:
                        # No per-column scale was built: flat colour.
                        n.inner_background.color = "orange"
                    n.opacity = 1.
                else:
                    n = TextFace(' ')
                    n.margin_top = 1
                    n.margin_right = 1
                    n.margin_left = 20
                    n.margin_bottom = 1
                    n.inner_background.color = "white"
                    n.opacity = 1.
                lf.add_face(n, col + col_add, position="aligned")

        # Label the leaf with its description, or its raw name if unknown.
        try:
            leaf_label = taxon2description[lf.name]
        except KeyError:
            leaf_label = lf.name
        n = TextFace(leaf_label, fgcolor="black", fsize=12, fstyle='italic')
        lf.add_face(n, 0)

    for n in t1.traverse():
        # Nodes with support below 1 get a black dot of size 6; the others
        # get a red marker of size 0.
        nstyle = NodeStyle()
        if n.support < 1:
            nstyle["fgcolor"] = "black"
            nstyle["size"] = 6
        else:
            nstyle["fgcolor"] = "red"
            nstyle["size"] = 0
        n.set_style(nstyle)

    return t1, tss
def plot_heatmap_tree_locus(biodb,
                            tree_file,
                            taxid2count,
                            taxid2identity=False,
                            taxid2locus=False,
                            reference_taxon=False,
                            n_paralogs_barplot=False):
    '''
    Plot a tree and an associated heatmap with the count of homologs.

    Optional columns:
        - identity of the closest homolog (``taxid2identity``)
        - locus tag of the closest homolog (``taxid2locus``)
        - barplot of the homolog count (``n_paralogs_barplot``)

    :param biodb: database name passed to ``manipulate_biosqldb.load_db``
    :param tree_file: tree source handed to ``Tree(...)``
    :param taxid2count: mapping str(leaf name) -> number of homologs.
        NOTE: leaves missing from the mapping are inserted with a count of 0,
        i.e. the caller's dictionary is modified in place.
    :param taxid2identity: optional mapping str(leaf name) -> identity (0-100)
    :param taxid2locus: optional mapping str(leaf name) -> locus tag
    :param reference_taxon: leaf name of the reference genome; its identity
        cell is left blank on a dark red background and its locus tag is red
    :param n_paralogs_barplot: if true, add a bar proportional to the count
    :return: tuple ``(tree, leaf_number, tree_style)``
    '''
    from chlamdb.biosqldb import manipulate_biosqldb

    def _header_face(title):
        # Slanted column title on a white background.
        face = TextFace(title)
        face.margin_top = 1
        face.margin_right = 1
        face.margin_left = 20
        face.margin_bottom = 1
        face.inner_background.color = "white"
        face.opacity = 1.
        face.rotation = -25
        return face

    server, db = manipulate_biosqldb.load_db(biodb)
    taxid2organism = manipulate_biosqldb.taxon_id2genome_description(
        server, biodb, True)

    t1 = Tree(tree_file)
    ts = TreeStyle()
    ts.draw_guiding_lines = True
    ts.guiding_lines_color = "gray"

    # Calculate the midpoint node
    R = t1.get_midpoint_outgroup()
    # and set it as tree outgroup
    t1.set_outgroup(R)

    leaf_number = 0

    # Every leaf needs a count; missing taxa default to 0.
    for lf in t1.iter_leaves():
        if str(lf.name) not in taxid2count:
            taxid2count[str(lf.name)] = 0
    max_count = max([taxid2count[str(lf.name)] for lf in t1.iter_leaves()])

    # The identity colour scale is the same for every leaf: build it once.
    identity_scale = None
    if taxid2identity:
        import matplotlib.cm as cm
        from matplotlib.colors import rgb2hex
        import matplotlib as mpl
        identity_scale = cm.ScalarMappable(
            norm=mpl.colors.Normalize(vmin=0, vmax=100), cmap=cm.OrRd)

    for i, lf in enumerate(t1.iter_leaves()):
        taxon = str(lf.name)
        is_reference = taxon == str(reference_taxon)

        # top leaf, add header
        if i == 0:
            ts.aligned_header.add_face(_header_face('Number of homologs'), 1)
            if taxid2identity:
                ts.aligned_header.add_face(_header_face('Protein identity'), 2)
            if taxid2locus:
                ts.aligned_header.add_face(_header_face('Locus tag'), 3)

        leaf_number += 1
        lf.branch_vertical_margin = 0

        data = [taxid2count[taxon]]

        # possibility to add one or more columns
        for col, value in enumerate(data):
            col_index = col
            n = TextFace(' %s ' % str(value))
            n.margin_top = 2
            n.margin_right = 2
            n.margin_left = 20 if col == 0 else 2
            n.margin_bottom = 2
            n.inner_background.color = "white"
            n.opacity = 1.
            lf.add_face(n, col, position="aligned")

            # optionally indicate number of paralogs as a barplot
            if n_paralogs_barplot:
                col_index += 1
                # When every count is 0 the bar is empty instead of
                # dividing by zero.
                if max_count:
                    percent = (float(value) / max_count) * 100
                else:
                    percent = 0
                n = StackedBarFace([percent, 100 - percent],
                                   width=150,
                                   height=18,
                                   colors=['#6699ff', 'white'],
                                   line_color='white')
                n.rotation = 0
                n.inner_border.color = "white"
                n.inner_border.width = 0
                n.margin_right = 15
                n.margin_left = 0
                lf.add_face(n, col + 1, position="aligned")

        # optionally add an additional column with identity
        if taxid2identity:
            try:
                identity = round(taxid2identity[taxon], 2)
            except (KeyError, TypeError):
                # no (usable) identity for this taxon
                value = '-'
            else:
                if identity != 100:
                    value = "%.2f" % identity
                else:
                    value = "%.1f" % identity
            if is_reference:
                value = ' '

            n = TextFace(' %s ' % value)
            n.margin_top = 2
            n.margin_right = 2
            n.margin_left = 20
            n.margin_bottom = 2
            # '!=' rather than 'is not': string identity is not guaranteed.
            if not value.isspace() and value != '-':
                n.inner_background.color = rgb2hex(
                    identity_scale.to_rgba(float(value)))
                # switch to white text on the darkest backgrounds
                if float(value) > 82:
                    n.fgcolor = 'white'
            n.opacity = 1.
            if is_reference:
                n.inner_background.color = '#800000'
            lf.add_face(n, col_index + 1, position="aligned")

        # optionally add column with locus name
        if taxid2locus:
            try:
                value = str(taxid2locus[taxon])
            except KeyError:
                value = '-'
            n = TextFace(' %s ' % value)
            n.margin_top = 2
            n.margin_right = 2
            n.margin_left = 2
            n.margin_bottom = 2
            if is_reference:
                n.fgcolor = '#ff0000'
            n.inner_background.color = "white"
            n.opacity = 1.
            lf.add_face(n, col_index + 2, position="aligned")

        # Replace the taxon id by the genome description for display.
        lf.name = taxid2organism[taxon]

    return t1, leaf_number, ts
def plot_heat_tree(biodb, taxid2n, tree_file):
    '''
    Build a midpoint-rooted tree with one aligned count cell per leaf.

    Each leaf (named by taxon id in the input tree) gets a text face showing
    its count from ``taxid2n``: light blue background when the count is
    positive, white otherwise. Leaves are then renamed to their genome
    description. Nothing is rendered here; the decorated tree is returned.

    :param biodb: database name passed to ``manipulate_biosqldb.load_db``
    :param taxid2n: mapping str(leaf name) -> count; leaves without an entry
        are shown with a count of 0
    :param tree_file: tree source handed to ``Tree(...)``; leaf names must be
        convertible to ``int`` (taxon ids)
    :return: tuple ``(tree, leaf_number)``
    '''
    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)
    taxid2organism = manipulate_biosqldb.taxon_id2genome_description(
        server, biodb, True)

    t1 = Tree(tree_file)

    # Calculate the midpoint node
    R = t1.get_midpoint_outgroup()
    # and set it as tree outgroup
    t1.set_outgroup(R)

    leaf_number = 0
    for lf in t1.iter_leaves():
        leaf_number += 1
        lf.branch_vertical_margin = 0

        # Missing taxa (or no usable mapping at all) count as 0.
        try:
            data = [taxid2n[str(lf.name)]]
        except (KeyError, TypeError):
            data = [0]

        # Replace the taxon id by the genome description for display.
        lf.name = taxid2organism[int(lf.name)]

        for col, value in enumerate(data):
            n = TextFace(' %s ' % str(value))
            n.margin_top = 2
            n.margin_right = 2
            n.margin_left = 2
            n.margin_bottom = 2
            # highlight only the leaves with at least one hit
            n.inner_background.color = "#81BEF7" if value > 0 else "white"
            n.opacity = 1.
            lf.add_face(n, col, position="aligned")

    return t1, leaf_number