def get_overlap_threshold(inputs, pp_flag, G=[]):
    """Estimate an overlap threshold from the pairwise Jaccard coefficients of
    known complexes: the midpoint of Q1 and the median of the non-zero values."""
    if pp_flag:
        comps = preprocess_complexes(inputs['dir_nm'] + inputs['comf_nm'], ' ', G)
        comps = [set(list(g.nodes)) for g in comps]
    else:
        with open(inputs['dir_nm'] + inputs['comf_nm']) as f:
            comps = [set(line.rstrip().split()) for line in f.readlines()]

    n_comps = len(comps)
    jcs = []
    for i in range(n_comps):
        for j in range(i + 1, n_comps):
            jc = jaccard_coeff(comps[i], comps[j])
            jcs.append(jc)

    jcs = [jc for jc in jcs if jc != 0]
    if len(jcs) == 0:
        return 0
    q1 = np_percentile(jcs, 25)
    q2 = np_percentile(jcs, 50)
    return float(q1 + 0.5 * (q2 - q1))
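# `jaccard_coeff` is called above but not defined in this file. A minimal
# sketch of the usual set definition, assuming that is what the helper
# implements: |A & B| / |A | B|.
def jaccard_coeff(set1, set2):
    """Jaccard coefficient of two sets (0 when both are empty)."""
    union = set1 | set2
    if not union:
        return 0.0
    return len(set1 & set2) / float(len(union))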
def get_overlap_threshold_qi(inputs, pp_flag, G=[]):
    """Estimate an overlap threshold from pairwise NA_threshold scores:
    the midpoint of the 2nd and 25th percentiles of the non-zero values."""
    if pp_flag:
        comps = preprocess_complexes(inputs['dir_nm'] + inputs['comf_nm'], ' ', G)
        comps = [set(list(g.nodes)) for g in comps]
    else:
        with open(inputs['dir_nm'] + inputs['comf_nm']) as f:
            comps = [set(line.rstrip().split()) for line in f.readlines()]

    n_comps = len(comps)
    jcs = []
    for i in range(n_comps):
        for j in range(i + 1, n_comps):
            jc = NA_threshold(comps[i], comps[j])
            jcs.append(jc)

    jcs = [jc for jc in jcs if jc != 0]
    if len(jcs) == 0:
        return 0
    q1 = np_percentile(jcs, 25)
    min_jc = np_percentile(jcs, 2)
    return float(min_jc + (q1 - min_jc) / 2.0)

# Example driver (kept for reference):
# inputs = dict()
# inputs['dir_nm'] = 'humap'
# inputs['comf_nm'] = '/res_train_complexes_new_73_more.txt'
#
# inputs['dir_nm'] = 'yeast'
# inputs['comf_nm'] = '/TAP-MS.txt'
# inputs['comf_nm'] = '/mips.txt'
#
# inputs['dir_nm'] = 'toy_network'
# inputs['comf_nm'] = '/train_complexes.txt'
#
# pp_flag = 1
# inputs['graph_files_dir'] = '/graph_files'
# myGraphName = inputs['dir_nm'] + inputs['graph_files_dir'] + "/res_myGraph"
# with open(myGraphName, 'rb') as f:
#     myGraph = pickle_load(f)
#
# sol1 = get_overlap_threshold(inputs, pp_flag, myGraph)
# sol2 = get_overlap_threshold_qi(inputs, pp_flag, myGraph)
# print(sol1)
# print(sol2)
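# `NA_threshold` is not defined in this file. In the protein-complex
# literature the overlap score used this way is usually the neighbourhood
# affinity NA(A, B) = |A & B|^2 / (|A| * |B|); this sketch assumes that is
# what the helper computes.
def NA_threshold(set1, set2):
    """Neighbourhood-affinity overlap of two sets (assumed definition)."""
    if not set1 or not set2:
        return 0.0
    inter = len(set1 & set2)
    return float(inter * inter) / (len(set1) * len(set2))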
def __writer(self, num_species, output_dir, writer_queue):
    """Write results for each species."""

    # gather results for each genome
    output_file = os.path.join(output_dir, 'ani_species.tsv')
    fout = open(output_file, 'w')
    fout.write('Species\tNo. Sampled Genomes\tMean ANI\tMedian ANI\t5th Percentile\t95th Percentile')
    fout.write('\tMean AF\tMedian AF\t5th Percentile\t95th Percentile')
    fout.write('\tSampled Genomes\n')

    output_file = os.path.join(output_dir, 'ani.tsv')
    fout_pw = open(output_file, 'w')
    fout_pw.write('Species\tGenome 1\tGenome 2\tANI(1->2)\tANI(2->1)\tAF(1->2)\tAF(2->1)\n')

    processed = 0
    while True:
        species, ani, af, genome_ids, results = writer_queue.get(block=True, timeout=None)
        if species is None:
            break

        processed += 1
        statusStr = 'Finished processing %d of %d (%.2f%%) species.' % (
            processed, num_species, float(processed) * 100 / num_species)
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()

        fout_pw.write(results)

        row = '%s\t%d' % (species, len(genome_ids))

        mean_ani = np_mean(ani)
        p5, median, p95 = np_percentile(ani, [5, 50, 95])
        row += '\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_ani, median, p5, p95)

        mean_af = np_mean(af)
        p5, median, p95 = np_percentile(af, [5, 50, 95])
        row += '\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_af * 100, median * 100, p5 * 100, p95 * 100)

        fout.write('%s\t%s\n' % (row, ','.join(genome_ids)))

    sys.stdout.write('\n')

    fout.close()
    fout_pw.close()
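# The writer loop above runs in its own process and exits on a `None`
# sentinel. A minimal sketch of the producer side, assuming the usual
# multiprocessing setup around this consumer (the tuple values here are
# illustrative only):
import multiprocessing as mp

writer_queue = mp.Queue()
# workers put (species, ani, af, genome_ids, results) tuples:
writer_queue.put(('Escherichia coli', [98.5, 99.1], [0.91, 0.95],
                  ['genome1', 'genome2'], 'pairwise rows...\n'))
writer_queue.put((None, None, None, None, None))  # signal the writer to exit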
def _gene_distribution(self, seq_file):
    """Calculate length distribution of sequences."""
    gene_lens = []
    for seq_id, seq in seq_io.read_seq(seq_file):
        gene_lens.append(len(seq))

    p10, p50, p90 = np_percentile(gene_lens, [10, 50, 90])

    return np_mean(gene_lens), max(gene_lens), min(gene_lens), p10, p50, p90
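# The functions in this section use aliased NumPy imports (np_mean,
# np_percentile, ...) rather than the usual `np.` prefix. A representative
# import block, assuming each alias maps to the standard NumPy function of
# the same name:
from numpy import (array as np_array,
                   arange as np_arange,
                   linspace as np_linspace,
                   histogram as np_histogram,
                   ones_like as np_ones_like,
                   mean as np_mean,
                   std as np_std,
                   median as np_median,
                   percentile as np_percentile)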
def _render_collapsed_rectangular(self, node, collapsed_group):
    """Render collapsed lineage in rectangular tree."""

    # get length of branches
    leaf_dists = []
    for leaf in node.preorder_iter(lambda n: n.is_leaf()):
        leaf_dists.append(dist_to_ancestor(leaf, node))

    branch1, branch2 = np_percentile(leaf_dists, [
        self.collapse_branch1_percentile,
        self.collapse_branch2_percentile
    ])
    branch1 = (branch1 / self.deepest_node) * self.height
    branch2 = (branch2 / self.deepest_node) * self.height

    if self.collapse_display_method == 'TRIANGLE':
        branch2 = 0

    # render collapsed lineage
    _support, taxon, _aux_info = parse_label(node.label)

    lineage_name, color, alpha, stroke_width, stroke_color = self.collapse_map[node]

    pts = []
    pts.append((node.x, node.y + 0.5 * node.collapsed_height))
    pts.append((node.x, node.y - 0.5 * node.collapsed_height))
    pts.append((node.x + branch1, node.y - 0.5 * node.collapsed_height))
    pts.append((node.x + branch2, node.y + 0.5 * node.collapsed_height))

    p = self.dwg.polygon(points=pts)
    p.fill(color=color, opacity=alpha)
    p.stroke(color=stroke_color, width=stroke_width)
    collapsed_group.add(p)

    if self.collapse_show_labels:
        if self.collapse_label_position == 'INTERNAL':
            label_x = node.x + 0.01 * self.inch
        elif self.collapse_label_position == 'EXTERNAL':
            label_x = max(node.x + branch1, node.x + branch2)

        label = lineage_name
        if self.collapse_show_leaf_count:
            label += ' [%d]' % len(leaf_dists)

        render_label(self.dwg, label_x, node.y, 0, label,
                     self.collapse_font_size, self.collapse_font_color,
                     collapsed_group)
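# `dist_to_ancestor` is used above but defined elsewhere. A minimal sketch
# matching the inline computation that appears later in this section (sum of
# edge lengths from a leaf up to the given ancestor), assuming dendropy-style
# nodes with `edge_length` and `parent_node`:
def dist_to_ancestor(leaf, ancestor):
    """Branch length from `leaf` up to `ancestor`."""
    d = 0.0
    n = leaf
    while n != ancestor:
        d += n.edge_length
        n = n.parent_node
    return d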
def run(self, input_tree, trusted_taxa_file, min_children, taxonomy_file, output_dir):
    """Calculate distribution of branch lengths at each taxonomic rank.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring distribution.
        Set to None to consider all taxa.
    min_children : int
        Only consider taxa with at least the specified number of children
        taxa when inferring distribution.
    taxonomy_file : str
        File containing taxonomic information for leaf nodes (if NULL, read
        taxonomy from tree).
    output_dir : str
        Desired output directory.
    """

    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

    # pull taxonomy from tree
    if not taxonomy_file:
        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, '%s.taxonomy.tsv' % input_tree_name)
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)
    else:
        self.logger.info('Reading taxonomy from file.')
        taxonomy = Taxonomy().read(taxonomy_file)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(
        tree, taxonomy, set(), min_children, -1)

    # determine branch lengths to leaves for named lineages
    rank_bl_dist = defaultdict(list)
    taxa_bl_dist = defaultdict(list)
    taxa_at_rank = defaultdict(list)
    for node in tree.postorder_node_iter():
        if node.is_leaf() or not node.label:
            continue

        _support, taxon, _auxiliary_info = parse_label(node.label)
        if not taxon:
            continue

        # get most specific rank in multi-rank taxa string
        taxa = [t.strip() for t in taxon.split(';')]
        taxon = taxa[-1]
        most_specific_rank = taxon[0:3]
        taxa_at_rank[Taxonomy.rank_index[most_specific_rank]].append(taxon)

        for n in node.leaf_iter():
            dist_to_node = self._dist_to_ancestor(n, node)
            for t in taxa:
                taxa_bl_dist[t].append(dist_to_node)

        rank = Taxonomy.rank_labels[Taxonomy.rank_index[most_specific_rank]]
        if rank != 'species' or Taxonomy().validate_species_name(taxon):
            if taxon in taxa_for_dist_inference:
                rank_bl_dist[rank].append(np_mean(taxa_bl_dist[taxon]))

    # report number of taxa at each rank
    print('')
    print('Rank\tTaxa\tTaxa for Inference')
    for rank, taxa in taxa_at_rank.items():
        taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
        print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference)))
    print('')

    # report results sorted by rank
    sorted_taxon = []
    for rank_prefix in Taxonomy.rank_prefixes:
        taxa_at_rank = []
        for taxon in taxa_bl_dist:
            if taxon.startswith(rank_prefix):
                taxa_at_rank.append(taxon)
        sorted_taxon += sorted(taxa_at_rank)

    # report results for each named group
    taxa_file = os.path.join(output_dir, '%s.taxa_bl_dist.tsv' % input_tree_name)
    fout = open(taxa_file, 'w')
    fout.write('Taxa\tUsed for Inference\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
    for taxon in sorted_taxon:
        dist = taxa_bl_dist[taxon]
        p = np_percentile(dist, [5, 10, 50, 90, 95])
        fout.write('%s\t%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (
            taxon,
            str(taxon in taxa_for_dist_inference),
            np_mean(dist), np_std(dist),
            p[0], p[1], p[2], p[3], p[4]))
    fout.close()

    # report results for each taxonomic rank
    rank_file = os.path.join(output_dir, '%s.rank_bl_dist.tsv' % input_tree_name)
    fout = open(rank_file, 'w')
    fout.write('Rank\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
    for rank in Taxonomy.rank_labels:
        dist = rank_bl_dist[rank]
        p = np_percentile(dist, [5, 10, 50, 90, 95])
        fout.write('%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (
            rank,
            np_mean(dist), np_std(dist),
            p[0], p[1], p[2], p[3], p[4]))
    fout.close()

    # report results for each node
    output_bl_file = os.path.join(output_dir, '%s.node_bl_dist.tsv' % input_tree_name)
    self._write_bl_dist(tree, output_bl_file)
def _distribution_summary_plot(self, phylum_rel_dists, taxa_for_dist_inference, plot_file):
    """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

    Parameters
    ----------
    phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
        Relative divergence of taxon at each rank for different phylum-level rootings.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    plot_file : str
        Desired name of output plot.
    """

    self.fig.clear()
    self.fig.set_size_inches(12, 6)
    ax = self.fig.add_subplot(111)

    # determine median relative distance for each taxa
    medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

    # create percentile and classification boundary lines
    percentiles = {}
    for i, rank in enumerate(sorted(medians_for_taxa.keys())):
        v = [np_median(dists)
             for taxon, dists in medians_for_taxa[rank].iteritems()
             if taxon in taxa_for_dist_inference]
        p10, p50, p90 = np_percentile(v, [10, 50, 90])
        ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

        for b in [-0.2, -0.1, 0.1, 0.2]:
            boundary = p50 + b
            if boundary < 1.0 and boundary > 0.0:
                if abs(b) == 0.1:
                    c = (1.0, 0.65, 0.0)  # orange
                else:
                    c = (1.0, 0.0, 0.0)
                ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

        percentiles[i] = [p10, p50, p90]

    # create scatter plot and results table
    x = []
    y = []
    c = []
    labels = []
    rank_labels = []
    for i, rank in enumerate(sorted(medians_for_taxa.keys())):
        rank_label = Taxonomy.rank_labels[rank]
        rank_labels.append(rank_label + ' (%d)' % len(medians_for_taxa[rank]))

        mono = []
        poly = []
        no_inference = []
        for clade_label, dists in medians_for_taxa[rank].iteritems():
            md = np_median(dists)
            x.append(md)
            y.append(i)
            labels.append(clade_label)

            if is_integer(clade_label.split('^')[-1]):
                # taxa with a numerical suffix after a caret indicate
                # polyphyletic groups when decorated with tax2tree
                c.append((1.0, 0.0, 0.0))
                poly.append(md)
            elif clade_label not in taxa_for_dist_inference:
                c.append((0.3, 0.3, 0.3))
                no_inference.append(md)
            else:
                c.append((0.0, 0.0, 1.0))
                mono.append(md)

        # histogram for each rank
        mono = np_array(mono)
        no_inference = np_array(no_inference)
        poly = np_array(poly)
        binwidth = 0.025
        bins = np_arange(0, 1.0 + binwidth, binwidth)

        mono_max_count = max(np_histogram(mono, bins=bins)[0])
        mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

        w = float(len(mono)) / (len(mono) + len(poly) + len(no_inference))
        n, b, p = ax.hist(mono, bins=bins,
                          color=(0.0, 0.0, 1.0),
                          alpha=0.25,
                          weights=0.9 * w * mono_weights,
                          bottom=i,
                          lw=0,
                          zorder=0)

        if len(no_inference) > 0:
            no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
            no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)

            ax.hist(no_inference, bins=bins,
                    color=(0.3, 0.3, 0.3),
                    alpha=0.25,
                    weights=0.9 * (1.0 - w) * no_inference_weights,
                    bottom=i + n,
                    lw=0,
                    zorder=0)

        if len(poly) > 0:
            poly_max_count = max(np_histogram(poly, bins=bins)[0])
            poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

            ax.hist(poly, bins=bins,
                    color=(1.0, 0.0, 0.0),
                    alpha=0.25,
                    weights=0.9 * (1.0 - w) * poly_weights,
                    bottom=i + n,
                    lw=0,
                    zorder=0)

    scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.01, 1.01])

    ax.set_ylabel('rank (no. taxa)')
    ax.set_yticks(xrange(0, len(medians_for_taxa)))
    ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
    ax.set_yticklabels(rank_labels)

    self.prettify(ax)

    # make plot interactive
    mpld3.plugins.clear(self.fig)
    mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
    mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(plot_file, dpi=self.dpi)
def _distribution_plot(self, rel_dists, rel_dist_thresholds, taxa_for_dist_inference, distribution_table, plot_file):
    """Create plot showing the distribution of taxa at each taxonomic rank.

    Parameters
    ----------
    rel_dists: d[rank_index][taxon] -> relative divergence
        Relative divergence of taxa at each rank.
    rel_dist_thresholds: list
        Relative distance cutoffs for defining ranks.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    distribution_table : str
        Desired name of output table with distribution information.
    plot_file : str
        Desired name of output plot.
    """

    self.fig.clear()
    self.fig.set_size_inches(12, 6)
    ax = self.fig.add_subplot(111)

    # create normal distributions
    for i, rank in enumerate(sorted(rel_dists.keys())):
        v = [dist for taxa, dist in rel_dists[rank].iteritems()
             if taxa in taxa_for_dist_inference]
        u = np_mean(v)
        rv = norm(loc=u, scale=np_std(v))
        x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
        nd = rv.pdf(x)
        ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2)
        ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2)

    # create percentile lines
    percentiles = {}
    for i, rank in enumerate(sorted(rel_dists.keys())):
        v = [dist for taxa, dist in rel_dists[rank].iteritems()
             if taxa in taxa_for_dist_inference]
        p10, p50, p90 = np_percentile(v, [10, 50, 90])
        ax.plot((p10, p10), (i, i + 0.5), 'r-', zorder=2)
        ax.plot((p50, p50), (i, i + 0.5), 'r-', zorder=2)
        ax.plot((p90, p90), (i, i + 0.5), 'r-', zorder=2)
        percentiles[i] = [p10, p50, p90]

    # create scatter plot and results table
    fout = open(distribution_table, 'w')
    fout.write('Taxa\tRelative Distance\tRank cutoff\tRank outlier\tP10\tMedian\tP90\tPercentile outlier\n')
    x = []
    y = []
    c = []
    labels = []
    rank_labels = []
    rel_dist_thresholds += [1.0]  # append boundary for species
    for i, rank in enumerate(sorted(rel_dists.keys())):
        rank_label = Taxonomy.rank_labels[rank]
        rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))

        for clade_label, dist in rel_dists[rank].iteritems():
            x.append(dist)
            y.append(i)
            labels.append(clade_label)

            if clade_label in taxa_for_dist_inference:
                c.append((0.0, 0.0, 0.5))
            else:
                c.append((0.5, 0.5, 0.5))

            p10, p50, p90 = percentiles[i]
            percentile_outlier = not (dist >= p10 and dist <= p90)

            if i == 0:
                rank_cutoff = rel_dist_thresholds[i]
                rank_outlier = dist > rank_cutoff
            else:
                rank_cutoff = rel_dist_thresholds[i]
                upper_rank_cutoff = rel_dist_thresholds[i - 1]
                rank_outlier = not (dist >= upper_rank_cutoff and dist <= rank_cutoff)

            v = [clade_label, dist, rank_cutoff, str(rank_outlier)]
            v += percentiles[i] + [str(percentile_outlier)]
            fout.write('%s\t%.2f\t%.2f\t%s\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v))
    fout.close()

    scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.05, 1.05])

    ax.set_ylabel('rank (no. taxa)')
    ax.set_yticks(xrange(0, len(rel_dists)))
    ax.set_ylim([-0.2, len(rel_dists) - 0.01])
    ax.set_yticklabels(rank_labels)

    self.prettify(ax)

    # plot relative divergence threshold lines
    y_min, y_max = ax.get_ylim()
    for threshold in rel_dist_thresholds[0:-1]:  # don't draw species boundary
        ax.plot((threshold, threshold), (y_min, y_max), color='r', ls='--')
        ax.text(threshold + 0.001, y_max, '%.3f' % threshold, horizontalalignment='center')

    # make plot interactive
    mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
    mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(plot_file, dpi=96)
def feature_extract(inputs, complex_graphs, test_complex_graphs, G):
    G_nodes = G.nodes()
    n_feats = inputs['feats']
    out_comp_nm = inputs['dir_nm'] + inputs['out_comp_nm']
    mode = inputs['mode']
    # mode = "non_gen"  # Change to gen if you want to generate matrices

    # n_pos = len(complex_graphs)
    sizes = [len(comp) for comp in complex_graphs]

    # get quartiles
    q1 = np_percentile(sizes, 25)
    q3 = np_percentile(sizes, 75)
    max_wo_outliers = math_ceil(q3 + 4.5 * (q3 - q1))  # Maximum after removing outliers
    max_size_train = max(sizes)
    recommended_max_size = min(max_size_train, max_wo_outliers)

    max_sizeF = inputs['dir_nm'] + inputs['train_test_files_dir'] + "/res_max_size_search"
    with open(max_sizeF, 'wb') as f:
        pickle_dump(recommended_max_size, f)

    # n_pos_test = len(test_complex_graphs)
    sizes_test = [len(comp) for comp in test_complex_graphs]
    max_size_test = max(sizes_test)

    fig = plt.figure()
    # Plot box plot of sizes to know the outliers (for setting step size in sampling)
    sns_boxplot(sizes)
    plt.xlabel("Size")
    plt.title("Size distribution of training complexes")
    plt.savefig(out_comp_nm + "_known_train_size_dist_box_plot")
    plt.close(fig)

    fig = plt.figure()
    # Plot box plot of sizes to know the outliers (for setting step size in sampling)
    sns_boxplot(sizes + sizes_test)
    plt.xlabel("Size")
    plt.title("Size distribution of known complexes")
    plt.savefig(out_comp_nm + "_known_size_dist_box_plot")
    plt.close(fig)

    if inputs['model_type'] == "tpot" and mode == "non_gen":  # CHANGE X_POS, Y_POS later !!!!
        logging_info("Reading labeled feature matrix from file...")
        # Read X, y from csv file
        y, X, X_pos, y_pos, X_neg, y_neg = read_from_csv(inputs['train_feat_mat'])
        y_test, X_test, X_pos_test, y_pos_test, X_neg_test, y_neg_test = read_from_csv(inputs['test_feat_mat'])
        logging_info("Finished reading feature matrix")
    else:
        logging_info("Feature extraction...")
        feat_list = ["dens", "nodes", "degree_max", "degree_mean", "degree_median",
                     "degree_var", "CC_max", "CC_mean", "CC_var", "edge_wt_mean",
                     "edge_wt_max", "edge_wt_var", "DC_mean", "DC_var", "DC_max",
                     "sv1", "sv2", "sv3", "complex"]

        X_pos = create_feat_mat(complex_graphs, n_feats)
        X_pos_test = create_feat_mat(test_complex_graphs, n_feats)
        X_allpos = np_vstack((X_pos, X_pos_test))
        n_allpos = len(X_allpos)

        y, X, X_pos, y_pos, X_neg, y_neg = extract_features(
            out_comp_nm, 'train', max_size_train, inputs, G_nodes,
            feat_list, X_pos, X_allpos, n_allpos, sizes)
        y_test, X_test, X_pos_test, y_pos_test, X_neg_test, y_neg_test = extract_features(
            out_comp_nm, 'test', max_size_test, inputs, G_nodes,
            feat_list, X_pos_test, X_allpos, n_allpos, sizes_test)
        logging_info("Finished Feature extraction")

    return (max_size_train, max_size_test, X_pos_test, X_neg_test, X_test, y_test,
            X_pos, y_pos, X, y, X_neg, y_neg)
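# The size cap above is an IQR-style outlier rule with a wide 4.5x fence
# (the conventional boxplot fence is 1.5x). A toy check of the arithmetic,
# assuming the same aliased imports used in this module (np_percentile,
# math_ceil); the sizes are made up:
from math import ceil as math_ceil

sizes = [3, 3, 4, 5, 5, 6, 8, 30]           # hypothetical complex sizes
q1 = np_percentile(sizes, 25)               # 3.75
q3 = np_percentile(sizes, 75)               # 6.5
cap = math_ceil(q3 + 4.5 * (q3 - q1))       # ceil(6.5 + 12.375) = 19
print(min(max(sizes), cap))                 # 19: the size-30 outlier is ignored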
def _do_combine(hdu_no: int,
                progress: float,
                progress_step: float,
                data_width: int,
                data_height: int,
                input_data: List[Union[pyfits.HDUList,
                                       Tuple[ndarray, pyfits.Header]]],
                mode: str = 'average',
                scaling: Optional[str] = None,
                rejection: Optional[str] = None,
                min_keep: int = 2,
                percentile: float = 50.0,
                lo: Optional[float] = None,
                hi: Optional[float] = None,
                max_mem_mb: float = 100.0,
                callback: Optional[callable] = None) \
        -> Tuple[Union[ndarray, ma.MaskedArray], float]:
    """
    Combine the given HDUs from all input images; used by :func:`combine`
    to get a stack of either all input images or, if lucky imaging is
    enabled, of their subset

    :return: image stack data and rejection percent
    """
    n = len(input_data)

    # Calculate scaling factors
    k_ref, k = None, []
    if scaling:
        for data_no, f in enumerate(input_data):
            if isinstance(f, pyfits.HDUList):
                data = f[hdu_no].data
            else:
                data = f[0]
            if scaling == 'average':
                k.append(data.mean())
            elif scaling == 'percentile':
                if percentile == 50:
                    k.append(median(data)
                             if not isinstance(data, ma.MaskedArray)
                             else ma.median(data))
                else:
                    k.append(np_percentile(data, percentile)
                             if not isinstance(data, ma.MaskedArray)
                             else np_percentile(data.compressed(), percentile))
            elif scaling == 'mode':
                # Compute modal values from histograms; convert to integer
                # and assume 2 x 16-bit data range
                if isinstance(data, ma.MaskedArray):
                    data = data.compressed()
                else:
                    data = data.ravel()
                min_val = data.min(initial=0)
                k.append(argmax(bincount(
                    (data - min_val).clip(0, 2*0x10000 - 1)
                    .astype(int32))) + min_val)
            else:
                raise ValueError(
                    'Unknown scaling mode "{}"'.format(scaling))

            if callback is not None:
                callback(progress + (data_no + 1)/n/2*progress_step)

        # Normalize to the first frame with non-zero average; keep images
        # with zero or same average as is
        k_ref = k[0]
        if not k_ref:
            for ki in k[1:]:
                if ki:
                    k_ref = ki
                    break

    # Process data in chunks to fit in the maximum amount of RAM allowed
    rowsize = 0
    for data in input_data:
        if isinstance(data, pyfits.HDUList):
            data = data[hdu_no].data
        else:
            data = data[0]
        rowsize += data[0].nbytes
        if rejection or isinstance(data, ma.MaskedArray):
            rowsize += data_width
    chunksize = min(max(int(max_mem_mb*(1 << 20)/rowsize), 1), data_height)
    while chunksize > 1:
        # Use as small chunks as possible but keep their total number
        if len(list(range(0, data_height, chunksize - 1))) > \
                len(list(range(0, data_height, chunksize))):
            break
        chunksize -= 1

    chunks = []
    rej_percent = 0
    for chunk in range(0, data_height, chunksize):
        datacube = [
            f[hdu_no].data[chunk:chunk + chunksize]
            if isinstance(f, pyfits.HDUList)
            else f[0][chunk:chunk + chunksize]
            for f in input_data
        ]

        if k_ref:
            # Scale data
            for data, ki in zip(datacube, k):
                if ki not in (0, k_ref):
                    data *= k_ref/ki

        # Reject outliers
        if rejection or any(isinstance(data, ma.MaskedArray)
                            for data in datacube):
            datacube = ma.masked_array(datacube)
            if not datacube.mask.shape:
                # No initially masked data, but we'll need an array instead
                # of mask=False to do slicing operations
                datacube.mask = full(datacube.shape, datacube.mask)
        else:
            datacube = array(datacube)

        if rejection == 'chauvenet':
            datacube.mask = chauvenet(datacube, min_vals=min_keep)
        elif rejection == 'iraf':
            if lo is None:
                lo = 1
            if hi is None:
                hi = 1
            if n - (lo + hi) < min_keep:
                raise ValueError(
                    'IRAF rejection with lo={}, hi={} would keep less than '
                    '{} values for a {}-image set'.format(lo, hi, min_keep, n))
            if lo or hi:
                # Mask "lo" smallest values and "hi" largest values along
                # the 0th axis
                order = datacube.argsort(0)
                mg = tuple(i.ravel() for i in indices(datacube.shape[1:]))
                for j in range(-hi, lo):
                    datacube.mask[(order[j].ravel(),) + mg] = True
                del order, mg
        elif rejection == 'minmax':
            if lo is not None and hi is not None:
                if lo > hi:
                    raise ValueError(
                        'lo={} > hi={} for minmax rejection'.format(lo, hi))
                datacube.mask[((datacube < lo) |
                               (datacube > hi)).nonzero()] = True
                if datacube.mask.all(0).any():
                    logging.warning(
                        '%d completely masked pixels left after minmax '
                        'rejection', datacube.mask.all(0).sum())
        elif rejection == 'sigclip':
            if lo is None:
                lo = 3
            if hi is None:
                hi = 3
            if lo < 0 or hi < 0:
                raise ValueError(
                    'Lower and upper limits for sigma clipping must be '
                    'positive, got lo={}, hi={}'.format(lo, hi))
            max_rej = n - min_keep
            while True:
                avg = datacube.mean(0)
                sigma = datacube.std(0)
                resid = datacube - avg
                outliers = (datacube.mask.sum(0) < max_rej) & \
                    (sigma > 0) & ((resid < -lo*sigma) | (resid > hi*sigma))
                if not outliers.any():
                    del avg, sigma, resid, outliers
                    break
                datacube.mask[outliers.nonzero()] = True
        elif rejection:
            raise ValueError(
                'Unknown rejection mode "{}"'.format(rejection))

        if isinstance(datacube, ma.MaskedArray):
            if datacube.mask is None or not datacube.mask.any():
                # Nothing was rejected
                datacube = datacube.data
            else:
                # Calculate the percentage of rejected pixels
                rej_percent += datacube.mask.sum()

        # Combine data
        if mode == 'average':
            res = datacube.mean(0)
        elif mode == 'sum':
            res = datacube.sum(0)
        elif mode == 'percentile':
            if percentile == 50:
                if isinstance(datacube, ma.MaskedArray):
                    res = ma.median(datacube, 0)
                else:
                    res = median(datacube, 0)
            else:
                if isinstance(datacube, ma.MaskedArray):
                    res = nanpercentile(datacube.filled(nan), percentile, 0)
                else:
                    res = np_percentile(datacube, percentile, 0)
        else:
            raise ValueError('Unknown stacking mode "{}"'.format(mode))
        chunks.append(res)

        if callback is not None:
            callback(progress + ((0.5 if scaling else 0) +
                                 min(chunk + chunksize, data_height)/data_height /
                                 (2 if scaling else 1))*progress_step)

    if len(chunks) > 1:
        res = ma.vstack(chunks)
    else:
        res = chunks[0]

    if isinstance(res, ma.MaskedArray) and (res.mask is None or
                                            not res.mask.any()):
        res = res.data

    return res, rej_percent
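# A minimal usage sketch for _do_combine, assuming `pyfits` in this module
# is `astropy.io.fits` (as the type hints suggest; imported here under a
# separate name to avoid clashing with the module's own import). The frames,
# shapes and parameter values are illustrative only:
import numpy as np
from astropy.io import fits as pyfits_demo

h, w = 64, 64
frames = [(np.random.normal(1000.0, 10.0, (h, w)), pyfits_demo.Header())
          for _ in range(3)]  # three synthetic (data, header) inputs
stack, rejected = _do_combine(
    hdu_no=0, progress=0.0, progress_step=100.0,
    data_width=w, data_height=h, input_data=frames,
    mode='average', scaling='average', rejection=None)
print(stack.shape, rejected)  # (64, 64) 0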
def main():
    parser = argparse_ArgumentParser("Input parameters")
    parser.add_argument("--input_file_name", default="input_toy.yaml", help="Input parameters file name")
    parser.add_argument("--out_dir_name", default="/results", help="Output directory name")
    parser.add_argument("--train_test_files_dir", default="", help="Train test file path")
    parser.add_argument("--graph_files_dir", default="", help="Graph files' folder path")
    parser.add_argument("--seed_mode", help="Seed mode - specify 'cliques' for the cliques algo")
    parser.add_argument("--max_size_thres", help="Max size threshold")
    parser.add_argument("--n_pts", default=1, help="number of partitions (computers)")
    args = parser.parse_args()

    with open(args.input_file_name, 'r') as f:
        inputs = yaml_load(f, yaml_Loader)

    if args.seed_mode:
        inputs['seed_mode'] = args.seed_mode
    if args.max_size_thres:
        inputs['max_size_thres'] = int(args.max_size_thres)

    # Override output directory name if same as gen
    if args.out_dir_name or inputs['out_comp_nm'] == "/results/res":
        if not os_path.exists(inputs['dir_nm'] + args.out_dir_name):
            os_mkdir(inputs['dir_nm'] + args.out_dir_name)
        inputs['out_comp_nm'] = args.out_dir_name + "/res"

    inputs['train_test_files_dir'] = ''
    if args.train_test_files_dir:
        if not os_path.exists(inputs['dir_nm'] + args.train_test_files_dir):
            os_mkdir(inputs['dir_nm'] + args.train_test_files_dir)
        inputs['train_test_files_dir'] = args.train_test_files_dir

    inputs['graph_files_dir'] = ''
    if args.graph_files_dir:
        if not os_path.exists(inputs['dir_nm'] + args.graph_files_dir):
            os_mkdir(inputs['dir_nm'] + args.graph_files_dir)
        inputs['graph_files_dir'] = args.graph_files_dir

    with open(inputs['dir_nm'] + inputs['out_comp_nm'] + "_input_sample_partition.yaml", 'w') as outfile:
        yaml_dump(inputs, outfile, default_flow_style=False)

    logging_basicConfig(filename=inputs['dir_nm'] + inputs['out_comp_nm'] + "_logs.yaml",
                        level=logging_INFO)

    neig_dicts_folder = inputs['dir_nm'] + inputs['graph_files_dir'] + "/neig_dicts"

    num_comp = inputs['num_comp']
    max_size_thres = inputs['max_size_thres']

    max_size_trainF = inputs['dir_nm'] + inputs['train_test_files_dir'] + "/res_max_size_train"
    with open(max_size_trainF, 'rb') as f:
        max_size_train = pickle_load(f)
    max_size = max_size_train

    max_sizeF_feat = inputs['dir_nm'] + inputs['train_test_files_dir'] + "/res_max_size_search"
    if os_path.exists(max_sizeF_feat):
        with open(max_sizeF_feat, 'rb') as f:
            max_size = pickle_load(f)
    else:
        with open(inputs['dir_nm'] + inputs['comf_nm']) as f:
            sizes = [len(line.rstrip().split()) for line in f.readlines()]
        max_size = max(sizes)
        q1 = np_percentile(sizes, 25)
        q3 = np_percentile(sizes, 75)
        max_wo_outliers = math_ceil(q3 + 4.5 * (q3 - q1))  # Maximum after removing outliers
        max_size = min(max_size, max_wo_outliers)

    if max_size >= max_size_thres:
        max_size = max_size_thres

    out_comp_nm = inputs['dir_nm'] + inputs['out_comp_nm']
    with open(out_comp_nm + '_metrics.out', "a") as fid:
        # NOT actual max size since you merge later
        print("Max number of steps for complex growth = ", max_size, file=fid)

    max_sizeF = inputs['dir_nm'] + inputs['train_test_files_dir'] + "/res_max_size_search_par"
    with open(max_sizeF, 'wb') as f:
        pickle_dump(max_size, f)

    seed_mode = inputs['seed_mode']

    if seed_mode == "all_nodes":
        # graph_nodes = list(myGraph.nodes())
        seed_nodes = rand_perm(os_listdir(neig_dicts_folder))
    elif seed_mode == "n_nodes":
        seed_nodes = rand_perm(os_listdir(neig_dicts_folder))[:num_comp]
    elif seed_mode == "all_nodes_known_comp":
        protlistfname = inputs['dir_nm'] + inputs['train_test_files_dir'] + "/res_protlist"
        with open(protlistfname, 'rb') as f:
            prot_list = pickle_load(f)
        seed_nodes = list(prot_list)
    elif seed_mode == "cliques":
        myGraphName = inputs['dir_nm'] + inputs['graph_files_dir'] + "/res_myGraph"
        with open(myGraphName, 'rb') as f:
            myGraph = pickle_load(f)
        clique_list = list(nx_find_cliques(myGraph))
        to_rem = []
        # Removing 2 node and big complexes
        for comp in clique_list:
            if len(comp) <= 2 or len(comp) >= max_size:
                to_rem.append(comp)
        for comp in to_rem:
            clique_list.remove(comp)
        seed_nodes = clique_list  # Remove duplicates later.

    # partition
    ptns = int(args.n_pts)
    nc = len(seed_nodes)
    if seed_mode == 'n_nodes':
        seed_nodes_F = out_comp_nm + "_seed_nodes"
        each_ptn = nc // ptns
        for i in range(ptns - 1):
            with open(seed_nodes_F + str(i), 'wb') as f:
                pickle_dump(seed_nodes[i * each_ptn:(i + 1) * each_ptn], f)
        with open(seed_nodes_F + str(ptns - 1), 'wb') as f:
            pickle_dump(seed_nodes[(ptns - 1) * each_ptn:], f)
    else:
        seed_nodes_dir = (inputs['dir_nm'] + inputs['graph_files_dir'] + "/" +
                          seed_mode + "_n_pts_" + str(ptns))
        if not os_path.exists(seed_nodes_dir):
            os_mkdir(seed_nodes_dir)
        seed_nodes_F = seed_nodes_dir + "/res_seed_nodes"
        each_ptn = nc // ptns
        for i in range(ptns - 1):
            with open(seed_nodes_F + str(i), 'wb') as f:
                pickle_dump(seed_nodes[i * each_ptn:(i + 1) * each_ptn], f)
        with open(seed_nodes_F + str(ptns - 1), 'wb') as f:
            pickle_dump(seed_nodes[(ptns - 1) * each_ptn:], f)
def run(self, input_tree, trusted_taxa_file, min_children, taxonomy_file, output_dir):
    """Calculate distribution of branch lengths at each taxonomic rank.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring distribution.
        Set to None to consider all taxa.
    min_children : int
        Only consider taxa with at least the specified number of children
        taxa when inferring distribution.
    taxonomy_file : str
        File containing taxonomic information for leaf nodes (if NULL, read
        taxonomy from tree).
    output_dir : str
        Desired output directory.
    """

    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # pull taxonomy from tree
    if not taxonomy_file:
        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)
    else:
        self.logger.info('Reading taxonomy from file.')
        taxonomy = Taxonomy().read(taxonomy_file)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree, taxonomy, set(), min_children, -1)

    # determine branch lengths to leaves for named lineages
    rank_bl_dist = defaultdict(list)
    taxa_bl_dist = defaultdict(list)
    taxa_at_rank = defaultdict(list)
    for node in tree.postorder_node_iter():
        if node.is_leaf() or not node.label:
            continue

        _support, taxon, _auxiliary_info = parse_label(node.label)
        if not taxon:
            continue

        # get most specific rank in multi-rank taxa string
        taxa = [t.strip() for t in taxon.split(';')]
        taxon = taxa[-1]
        most_specific_rank = taxon[0:3]
        taxa_at_rank[Taxonomy.rank_index[most_specific_rank]].append(taxon)

        for n in node.leaf_iter():
            dist_to_node = 0
            while n != node:
                dist_to_node += n.edge_length
                n = n.parent_node

            for t in taxa:
                taxa_bl_dist[t].append(dist_to_node)

        rank = Taxonomy.rank_labels[Taxonomy.rank_index[most_specific_rank]]
        if rank != 'species' or Taxonomy().validate_species_name(taxon):
            if taxon in taxa_for_dist_inference:
                rank_bl_dist[rank].append(np_mean(taxa_bl_dist[taxon]))

    # report number of taxa at each rank
    print ''
    print 'Rank\tTaxa\tTaxa for Inference'
    for rank, taxa in taxa_at_rank.iteritems():
        taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
        print '%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference))
    print ''

    # report results sorted by rank
    sorted_taxon = []
    for rank_prefix in Taxonomy.rank_prefixes:
        taxa_at_rank = []
        for taxon in taxa_bl_dist:
            if taxon.startswith(rank_prefix):
                taxa_at_rank.append(taxon)
        sorted_taxon += sorted(taxa_at_rank)

    # report results for each named group
    taxa_file = os.path.join(output_dir, 'taxa_bl_dist.tsv')
    fout = open(taxa_file, 'w')
    fout.write('Taxa\tUsed for Inference\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
    for taxon in sorted_taxon:
        dist = taxa_bl_dist[taxon]
        p = np_percentile(dist, [5, 10, 50, 90, 95])
        fout.write('%s\t%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (
            taxon,
            str(taxon in taxa_for_dist_inference),
            np_mean(dist), np_std(dist),
            p[0], p[1], p[2], p[3], p[4]))
    fout.close()

    # report results for each taxonomic rank
    rank_file = os.path.join(output_dir, 'rank_bl_dist.tsv')
    fout = open(rank_file, 'w')
    fout.write('Rank\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
    for rank in Taxonomy.rank_labels:
        dist = rank_bl_dist[rank]
        p = np_percentile(dist, [5, 10, 50, 90, 95])
        fout.write('%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (
            rank,
            np_mean(dist), np_std(dist),
            p[0], p[1], p[2], p[3], p[4]))
    fout.close()
def _distribution_summary_plot(self, phylum_rel_dists, taxa_for_dist_inference,
                               highlight_polyphyly, highlight_taxa,
                               fmeasure, fmeasure_mono,
                               plot_file):
    """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

    Parameters
    ----------
    phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
        Relative divergence of taxon at each rank for different phylum-level rootings.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    plot_file : str
        Desired name of output plot.
    """

    self.fig.clear()
    self.fig.set_size_inches(12, 6)
    ax = self.fig.add_subplot(111)

    # determine median relative distance for each taxa
    medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

    # create percentile and classification boundary lines
    percentiles = {}
    for i, rank in enumerate(sorted(medians_for_taxa.keys())):
        v = [np_median(dists)
             for taxon, dists in medians_for_taxa[rank].iteritems()
             if taxon in taxa_for_dist_inference]
        if not v:
            # no taxa at rank suitable for creating classification boundaries
            continue

        p10, p50, p90 = np_percentile(v, [10, 50, 90])
        # ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p50, p50), (i, i + 0.5), c=(0.0, 0.0, 1.0), lw=2, zorder=2)
        # ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

        for b in [-0.1, 0.1]:
            boundary = p50 + b
            if boundary < 1.0 and boundary > 0.0:
                if abs(b) == 0.1:
                    c = (0.0, 0.0, 0.0)
                else:
                    c = (1.0, 0.0, 0.0)
                ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

        percentiles[i] = [p10, p50, p90]

    # create scatter plot and results table
    x = []
    y = []
    c = []
    labels = []
    rank_labels = []
    for i, rank in enumerate(sorted(medians_for_taxa.keys())):
        rank_label = Taxonomy.rank_labels[rank]
        rank_labels.append(rank_label.capitalize() + ' (%d)' % len(medians_for_taxa[rank]))

        mono = []
        poly = []
        near_mono = []
        for clade_label, dists in medians_for_taxa[rank].iteritems():
            md = np_median(dists)
            x.append(md)
            y.append(i)
            labels.append(clade_label)

            if ((highlight_polyphyly and fmeasure[clade_label] < fmeasure_mono)
                    or clade_label in highlight_taxa):
                c.append((1.0, 0.0, 0.0))
                poly.append(md)
            elif highlight_polyphyly and fmeasure[clade_label] != 1.0:
                c.append((255.0/255, 187.0/255, 120.0/255))
                near_mono.append(md)
            else:
                c.append((152.0/255, 223.0/255, 138.0/255))
                mono.append(md)

        # histogram for each rank
        binwidth = 0.025
        bins = np_arange(0, 1.0 + binwidth, binwidth)
        max_bin_count = max(np_histogram(mono + near_mono + poly, bins=bins)[0])

        mono_bottom = 0
        near_mono_bottom = 0
        mono = np_array(mono)
        near_mono = np_array(near_mono)
        poly = np_array(poly)
        if len(mono) > 0:
            mono_bottom, b, p = ax.hist(mono, bins=bins,
                                        color=(152.0/255, 223.0/255, 138.0/255),
                                        alpha=0.5,
                                        weights=0.9 * (1.0 / max_bin_count) * np_ones_like(mono),
                                        bottom=i,
                                        lw=0,
                                        zorder=0)

        if len(near_mono) > 0:
            near_mono_bottom, b, p = ax.hist(near_mono, bins=bins,
                                             color=(255.0/255, 187.0/255, 120.0/255),
                                             alpha=0.5,
                                             weights=0.9 * (1.0 / max_bin_count) * np_ones_like(near_mono),
                                             bottom=i + mono_bottom,
                                             lw=0,
                                             zorder=0)

        if len(poly) > 0:
            ax.hist(poly, bins=bins,
                    color=(1.0, 0.0, 0.0),
                    alpha=0.5,
                    weights=0.9 * (1.0 / max_bin_count) * np_ones_like(poly),
                    bottom=i + mono_bottom + near_mono_bottom,
                    lw=0,
                    zorder=0)

    scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

    ax.set_xlabel('Relative Evolutionary Divergence')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.01, 1.01])

    ax.set_ylabel('Rank (no. taxa)')
    ax.set_yticks(xrange(0, len(medians_for_taxa)))
    ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
    ax.set_yticklabels(rank_labels)

    self.prettify(ax)

    # make plot interactive
    mpld3.plugins.clear(self.fig)
    mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
    mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(plot_file, dpi=self.dpi)
    self.fig.savefig(plot_file.replace('.png', '.svg'), dpi=self.dpi)
def _distribution_plot(self, rel_dists, taxa_for_dist_inference,
                       highlight_polyphyly, highlight_taxa,
                       distribution_table, fmeasure, fmeasure_mono,
                       plot_file, viral):
    """Create plot showing the distribution of taxa at each taxonomic rank.

    Parameters
    ----------
    rel_dists: d[rank_index][taxon] -> relative divergence
        Relative divergence of taxa at each rank.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    distribution_table : str
        Desired name of output table with distribution information.
    plot_file : str
        Desired name of output plot.
    """

    self.fig.clear()
    self.fig.set_size_inches(12, 6)
    ax = self.fig.add_subplot(111)

    # create percentile and classification boundary lines
    percentiles = {}
    for i, rank in enumerate(sorted(rel_dists.keys())):
        v = [dist for taxa, dist in rel_dists[rank].items()
             if taxa in taxa_for_dist_inference]
        if len(v) == 0:
            continue

        p10, p50, p90 = np_percentile(v, [10, 50, 90])
        ax.plot((p50, p50), (i, i + 0.5), c=self.median_color, lw=2, zorder=2)

        for b in [-0.1, 0.1]:
            boundary = p50 + b
            if boundary < 1.0 and boundary > 0.0:
                ax.plot((boundary, boundary), (i, i + 0.25),
                        c=(0.0, 0.0, 0.0), lw=2, zorder=2)

        percentiles[i] = [p10, p50, p90]

    # create scatter plot and results table
    fout = open(distribution_table, 'w')
    fout.write('Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n')
    x = []
    y = []
    c = []
    labels = []
    rank_labels = []
    for i, rank in enumerate(sorted(rel_dists.keys())):
        if viral:
            rank_label = VIRAL_RANK_LABELS[rank]
        else:
            rank_label = Taxonomy.rank_labels[rank]
        rank_labels.append(rank_label.capitalize() + ' ({:,})'.format(len(rel_dists[rank])))

        mono = []
        poly = []
        nearly_mono = []
        for clade_label, dist in rel_dists[rank].items():
            x.append(dist)
            y.append(i)
            labels.append(clade_label)

            if ((highlight_polyphyly and fmeasure[clade_label] < fmeasure_mono)
                    or clade_label in highlight_taxa):
                c.append(self.poly_color)
                poly.append(dist)
            elif highlight_polyphyly and fmeasure[clade_label] != 1.0:
                c.append(self.near_mono_color)
                nearly_mono.append(dist)
            else:
                c.append(self.mono_color)
                mono.append(dist)

            # report results
            v = [clade_label, dist]
            if i in percentiles:
                p10, p50, p90 = percentiles[i]
                percentile_outlier = not (dist >= p10 and dist <= p90)
                v += percentiles[i] + [str(percentile_outlier)]
            else:
                percentile_outlier = 'Insufficient data to calculate percentiles'
                v += [-1, -1, -1] + [str(percentile_outlier)]

            fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v))

        # histogram for each rank
        binwidth = 0.025
        bins = np_arange(0, 1.0 + binwidth, binwidth)
        max_bin_count = max(np_histogram(mono + nearly_mono + poly, bins=bins)[0])

        num_taxa = len(mono) + len(poly) + len(nearly_mono)
        if num_taxa == 0:
            break

        mono = np_array(mono)
        nearly_mono = np_array(nearly_mono)
        poly = np_array(poly)

        bottom_mono = 0
        if len(mono) > 0:
            bottom_mono, b, p = ax.hist(mono, bins=bins,
                                        color=self.mono_color,
                                        alpha=0.5,
                                        weights=0.9 * (1.0 / max_bin_count) * np_ones_like(mono),
                                        bottom=i,
                                        lw=0,
                                        zorder=0)

        bottom_nearly_mono = 0
        if len(nearly_mono) > 0:
            bottom_nearly_mono, b, p = ax.hist(nearly_mono, bins=bins,
                                               color=self.near_mono_color,
                                               alpha=0.5,
                                               weights=0.9 * (1.0 / max_bin_count) * np_ones_like(nearly_mono),
                                               bottom=i + bottom_mono,
                                               lw=0,
                                               zorder=0)

        if len(poly) > 0:
            ax.hist(poly, bins=bins,
                    color=self.poly_color,
                    alpha=0.5,
                    weights=0.9 * (1.0 / max_bin_count) * np_ones_like(poly),
                    bottom=i + bottom_mono + bottom_nearly_mono,
                    lw=0,
                    zorder=0)
    fout.close()

    # overlay scatter plot elements
    scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1,
                         lw=1, edgecolors='black')

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

    ax.set_xlabel('Relative Evolutionary Divergence')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.05, 1.05])

    ax.set_ylabel('Rank (no. taxa)')
    ax.set_yticks(range(0, len(rel_dists)))
    ax.set_ylim([-0.2, len(rel_dists) - 0.01])
    ax.set_yticklabels(rank_labels)

    self.prettify(ax)

    # make plot interactive
    if not self.skip_mpld3:
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.plugins.connect(self.fig, AxisReplacer(rank_labels))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(plot_file, dpi=self.dpi)
    self.fig.savefig(plot_file.replace('.png', '.svg'), dpi=self.dpi)
def _distribution_summary_plot(self, phylum_rel_dists, taxa_for_dist_inference, plot_file):
    """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

    Parameters
    ----------
    phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
        Relative divergence of taxon at each rank for different phylum-level rootings.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    plot_file : str
        Desired name of output plot.
    """

    self.fig.clear()
    self.fig.set_size_inches(12, 6)
    ax = self.fig.add_subplot(111)

    # determine median relative distance for each taxa
    medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

    # create percentile and classification boundary lines
    percentiles = {}
    for i, rank in enumerate(sorted(medians_for_taxa.keys())):
        v = [np_median(dists)
             for taxon, dists in medians_for_taxa[rank].items()
             if taxon in taxa_for_dist_inference]
        if not v:
            # no taxa at rank suitable for creating classification
            # boundaries
            continue

        p10, p50, p90 = np_percentile(v, [10, 50, 90])
        ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

        for b in [-0.2, -0.1, 0.1, 0.2]:
            boundary = p50 + b
            if 1.0 > boundary > 0.0:
                if abs(b) == 0.1:
                    c = (1.0, 0.65, 0.0)  # orange
                else:
                    c = (1.0, 0.0, 0.0)
                ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

        percentiles[i] = [p10, p50, p90]

    # create scatter plot and results table
    x = []
    y = []
    c = []
    labels = []
    rank_labels = []
    for i, rank in enumerate(sorted(medians_for_taxa.keys())):
        rank_label = Taxonomy.rank_labels[rank]
        rank_labels.append(rank_label + ' (%d)' % len(medians_for_taxa[rank]))

        mono = []
        poly = []
        no_inference = []
        for clade_label, dists in medians_for_taxa[rank].items():
            md = np_median(dists)
            x.append(md)
            y.append(i)
            labels.append(clade_label)

            if self._is_integer(clade_label.split('^')[-1]):
                # taxa with a numerical suffix after a caret indicate
                # polyphyletic groups when decorated with tax2tree
                c.append((1.0, 0.0, 0.0))
                poly.append(md)
            elif clade_label not in taxa_for_dist_inference:
                c.append((0.3, 0.3, 0.3))
                no_inference.append(md)
            else:
                c.append((0.0, 0.0, 1.0))
                mono.append(md)

        # histogram for each rank
        n = 0
        if len(mono) > 0:
            mono = np_array(mono)
            no_inference = np_array(no_inference)
            poly = np_array(poly)
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)

            mono_max_count = max(np_histogram(mono, bins=bins)[0])
            mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

            w = float(len(mono)) / (len(mono) + len(poly) + len(no_inference))
            n, b, p = ax.hist(mono, bins=bins,
                              color=(0.0, 0.0, 1.0),
                              alpha=0.25,
                              weights=0.9 * w * mono_weights,
                              bottom=i,
                              lw=0,
                              zorder=0)

        if len(no_inference) > 0:
            no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
            no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)

            ax.hist(no_inference, bins=bins,
                    color=(0.3, 0.3, 0.3),
                    alpha=0.25,
                    weights=0.9 * (1.0 - w) * no_inference_weights,
                    bottom=i + n,
                    lw=0,
                    zorder=0)

        if len(poly) > 0:
            poly_max_count = max(np_histogram(poly, bins=bins)[0])
            poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

            ax.hist(poly, bins=bins,
                    color=(1.0, 0.0, 0.0),
                    alpha=0.25,
                    weights=0.9 * (1.0 - w) * poly_weights,
                    bottom=i + n,
                    lw=0,
                    zorder=0)

    scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.01, 1.01])

    ax.set_ylabel('rank (no. taxa)')
    ax.set_yticks(list(range(0, len(medians_for_taxa))))
    ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
    ax.set_yticklabels(rank_labels)

    self.prettify(ax)

    # make plot interactive
    mpld3.plugins.clear(self.fig)
    mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
    mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(plot_file, dpi=self.dpi)
def _percent_correct_plot(self, rel_dists, taxa_for_dist_inference, output_prefix):
    """Create plots showing correctly classified taxa for different relative distance values.

    Parameters
    ----------
    rel_dists : d[rank_index][taxon] -> relative divergence
        Relative divergence of taxa at each rank.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring relative divergence thresholds.
    output_prefix : str
        Prefix for plots.
    """

    print ''
    print '  Relative divergence thresholds (rank, threshold, parent taxa, child taxa):'

    ranks = sorted(rel_dists.keys())
    rel_dist_thresholds = []
    for i in xrange(ranks[0], ranks[-1]):
        parent_rank = i
        child_rank = i + 1

        # determine classification results for relative divergence
        # values between the medians of adjacent taxonomic ranks
        parent_rds = []
        for taxa, rd in rel_dists[parent_rank].iteritems():
            if taxa in taxa_for_dist_inference:
                parent_rds.append(rd)
        parent_p50 = np_percentile(parent_rds, 50)

        child_rds = []
        for taxa, rd in rel_dists[child_rank].iteritems():
            if taxa in taxa_for_dist_inference:
                child_rds.append(rd)
        child_p50 = np_percentile(child_rds, 50)

        r = []
        y_parent = []
        y_child = []
        y_mean_corr = []
        for test_r in np_linspace(parent_p50, child_p50, 100):
            parent_cor = float(sum([1 for rd in parent_rds if rd <= test_r])) / len(parent_rds)
            child_cor = float(sum([1 for rd in child_rds if rd > test_r])) / len(child_rds)

            r.append(test_r)
            y_parent.append(parent_cor)
            y_child.append(child_cor)
            y_mean_corr.append(0.5 * parent_cor + 0.5 * child_cor)

        # create plot of correctly classified taxa
        self.fig.clear()
        self.fig.set_size_inches(6, 6)
        ax = self.fig.add_subplot(111)

        ax.plot(r, y_parent, 'k--', label=Taxonomy.rank_labels[i])
        ax.plot(r, y_child, 'k:', label=Taxonomy.rank_labels[i + 1])
        ax.plot(r, y_mean_corr, 'r-', label='mean')

        legend = ax.legend(loc='upper left')
        legend.draw_frame(False)

        # find maximum of mean correct classification; the comprehension
        # index is named `idx` to avoid shadowing the rank index `i`
        max_mean = max(y_mean_corr)
        r_max_values = [r[idx] for idx, rd in enumerate(y_mean_corr) if rd == max_mean]
        r_max_value = np_mean(r_max_values)  # Note: this will fail if there are multiple local maxima
        print '  %s\t%.3f\t%d\t%d' % (Taxonomy.rank_labels[parent_rank], r_max_value, len(parent_rds), len(child_rds))

        # check that there is a single local maximum
        rd_indices = [idx for idx, rd in enumerate(y_mean_corr) if rd == max_mean]
        for rd_index in xrange(0, len(rd_indices) - 1):
            if rd_indices[rd_index] != rd_indices[rd_index + 1] - 1:
                print '[Warning] There are multiple local maxima, so estimated relative divergence threshold will be invalid.'

        rel_dist_thresholds.append(r_max_value)

        y_min, _y_max = ax.get_ylim()
        ax.axvline(x=r_max_value, ymin=0, ymax=1, color='r', ls='--')
        ax.text(r_max_value + 0.001, y_min + 0.01, '%.3f' % r_max_value, horizontalalignment='left')

        ax.set_xlabel('relative distance')
        ax.set_ylabel('% taxa correctly classified')

        self.prettify(ax)

        self.fig.tight_layout(pad=1)
        self.fig.savefig(output_prefix + '.%s_%s.png' % (Taxonomy.rank_labels[parent_rank], Taxonomy.rank_labels[child_rank]), dpi=96)

    print ''

    return rel_dist_thresholds
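# The threshold search above maximises the mean of two accuracies: the
# fraction of parent-rank RED values at or below a candidate threshold and
# the fraction of child-rank values above it. A compact standalone version
# of that criterion (the helper name is ours; np_percentile and np_linspace
# are the same aliased NumPy imports used throughout this section):
def best_threshold(parent_rds, child_rds, n_steps=100):
    """Scan between the two medians for the threshold with highest mean accuracy."""
    lo, hi = np_percentile(parent_rds, 50), np_percentile(child_rds, 50)
    best_r, best_acc = lo, -1.0
    for r in np_linspace(lo, hi, n_steps):
        parent_cor = sum(1 for rd in parent_rds if rd <= r) / float(len(parent_rds))
        child_cor = sum(1 for rd in child_rds if rd > r) / float(len(child_rds))
        acc = 0.5 * parent_cor + 0.5 * child_cor
        if acc > best_acc:
            best_r, best_acc = r, acc
    return best_r, best_acc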
def _distribution_plot(self, rel_dists, taxa_for_dist_inference, distribution_table, plot_file):
    """Create plot showing the distribution of taxa at each taxonomic rank.

    Parameters
    ----------
    rel_dists: d[rank_index][taxon] -> relative divergence
        Relative divergence of taxa at each rank.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    distribution_table : str
        Desired name of output table with distribution information.
    plot_file : str
        Desired name of output plot.
    """

    self.fig.clear()
    self.fig.set_size_inches(12, 6)
    ax = self.fig.add_subplot(111)

    # create normal distributions
    for i, rank in enumerate(sorted(rel_dists.keys())):
        v = [dist for taxa, dist in rel_dists[rank].items()
             if taxa in taxa_for_dist_inference]
        if len(v) < 2:
            continue

        u = np_mean(v)
        rv = norm(loc=u, scale=np_std(v))
        x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
        nd = rv.pdf(x)
        # ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2)
        # ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2)

    # create percentile and classification boundary lines
    percentiles = {}
    for i, rank in enumerate(sorted(rel_dists.keys())):
        v = [dist for taxa, dist in rel_dists[rank].items()
             if taxa in taxa_for_dist_inference]
        if len(v) == 0:
            continue

        p10, p50, p90 = np_percentile(v, [10, 50, 90])
        ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

        for b in [-0.2, -0.1, 0.1, 0.2]:
            boundary = p50 + b
            if boundary < 1.0 and boundary > 0.0:
                if abs(b) == 0.1:
                    c = (1.0, 0.65, 0.0)  # orange
                else:
                    c = (1.0, 0.0, 0.0)
                ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

        percentiles[i] = [p10, p50, p90]

    # create scatter plot and results table
    fout = open(distribution_table, 'w')
    fout.write('Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n')
    x = []
    y = []
    c = []
    labels = []
    rank_labels = []
    for i, rank in enumerate(sorted(rel_dists.keys())):
        rank_label = Taxonomy.rank_labels[rank]
        rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))

        mono = []
        poly = []
        no_inference = []
        for clade_label, dist in rel_dists[rank].items():
            x.append(dist)
            y.append(i)
            labels.append(clade_label)

            if is_integer(clade_label.split('^')[-1]):
                # taxa with a numerical suffix after a caret indicate
                # polyphyletic groups when decorated with tax2tree
                c.append((1.0, 0.0, 0.0))
                poly.append(dist)
            elif clade_label not in taxa_for_dist_inference:
                c.append((0.3, 0.3, 0.3))
                no_inference.append(dist)
            else:
                c.append((0.0, 0.0, 1.0))
                mono.append(dist)

            # report results
            v = [clade_label, dist]
            if i in percentiles:
                p10, p50, p90 = percentiles[i]
                percentile_outlier = not (dist >= p10 and dist <= p90)
                v += percentiles[i] + [str(percentile_outlier)]
            else:
                percentile_outlier = 'Insufficient data to calculate percentiles'
                v += [-1, -1, -1] + [str(percentile_outlier)]

            fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v))

        # histogram for each rank
        mono = np_array(mono)
        no_inference = np_array(no_inference)
        poly = np_array(poly)
        binwidth = 0.025
        bins = np_arange(0, 1.0 + binwidth, binwidth)

        d = len(mono) + len(poly) + len(no_inference)
        if d == 0:
            break

        w = float(len(mono)) / d
        n = 0
        if len(mono) > 0:
            mono_max_count = max(np_histogram(mono, bins=bins)[0])
            mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

            n, b, p = ax.hist(mono, bins=bins,
                              color=(0.0, 0.0, 1.0),
                              alpha=0.25,
                              weights=0.9 * w * mono_weights,
                              bottom=i,
                              lw=0,
                              zorder=0)

        if len(no_inference) > 0:
            no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
            no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)

            ax.hist(no_inference, bins=bins,
                    color=(0.3, 0.3, 0.3),
                    alpha=0.25,
                    weights=0.9 * (1.0 - w) * no_inference_weights,
                    bottom=i + n,
                    lw=0,
                    zorder=0)

        if len(poly) > 0:
            poly_max_count = max(np_histogram(poly, bins=bins)[0])
            poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

            ax.hist(poly, bins=bins,
                    color=(1.0, 0.0, 0.0),
                    alpha=0.25,
                    weights=0.9 * (1.0 - w) * poly_weights,
                    bottom=i + n,
                    lw=0,
                    zorder=0)
    fout.close()

    # overlay scatter plot elements
    scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.05, 1.05])

    ax.set_ylabel('rank (no. taxa)')
    ax.set_yticks(range(0, len(rel_dists)))
    ax.set_ylim([-0.2, len(rel_dists) - 0.01])
    ax.set_yticklabels(rank_labels)

    self.prettify(ax)

    # make plot interactive
    mpld3.plugins.clear(self.fig)
    mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
    mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(plot_file, dpi=self.dpi)
def run(self, rank, input_tree_dir, full_tree_file, derep_tree_file, taxonomy_file, output_prefix, min_children, title):
    # determine named clades in full tree
    named_clades = set()
    tree = dendropy.Tree.get_from_path(full_tree_file,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)
    for node in tree.preorder_node_iter():
        if node.label:
            taxonomy = node.label.split(';')
            named_clades.add(taxonomy[-1].strip().split(':')[-1])

    print('Identified %d named clades in full tree.' % len(named_clades))

    # determine named groups with at least the specified number of children
    print('Determining taxa with sufficient named children lineages.')
    taxon_children = defaultdict(set)
    groups = defaultdict(list)
    print(taxonomy_file)
    for line in open(taxonomy_file):
        line_split = line.replace('; ', ';').split()
        genome_id = line_split[0]
        taxonomy = [x.strip() for x in line_split[1].split(';')]

        if len(taxonomy) > rank + 1:
            taxon_children[taxonomy[rank]].add(taxonomy[rank + 1])

        if len(taxonomy) > rank:
            groups[taxonomy[rank]].append(genome_id)

    groups_to_consider = set()
    for taxon, children_taxa in taxon_children.items():
        if len(children_taxa) >= min_children and taxon in named_clades:
            groups_to_consider.add(taxon)

    print('Assessing distribution over %d groups.' % len(groups_to_consider))

    # calculate RED for full tree
    print('')
    print('Calculating RED over full tree.')
    tree = dendropy.Tree.get_from_path(full_tree_file,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)
    full_rel_dist, _full_dist_components, polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups)
    if len(polyphyletic) > 0:
        print('')
        print('[Warning] Full tree contains polyphyletic groups.')

    # calculate RED for dereplicated tree
    print('')
    print('Calculating RED over dereplicated tree.')
    tree = dendropy.Tree.get_from_path(derep_tree_file,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)
    derep_rel_dist, derep_dist_components, polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups)
    groups_to_consider = groups_to_consider - polyphyletic
    print('Assessing distribution over %d groups after removing polyphyletic groups in original trees.' % len(groups_to_consider))

    # calculate RED to each group in each tree
    print('')
    rel_dists = defaultdict(list)
    dist_components = defaultdict(list)
    for f in os.listdir(input_tree_dir):
        if not f.endswith('.rooted.tree'):
            continue

        print(f)

        tree_file = os.path.join(input_tree_dir, f)
        tree = dendropy.Tree.get_from_path(tree_file,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # calculate relative distance to named taxa
        rel_dist, components, _polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups)

        for taxon, dist in rel_dist.items():
            rel_dists[taxon].append(dist)
            dist_components[taxon].append(components[taxon])

    # create scatter plot
    x = []
    y = []
    xDerep = []
    yDerep = []
    xFull = []
    yFull = []
    perc10 = []
    perc90 = []
    labels = []
    fout = open(output_prefix + '.tsv', 'w')
    fout.write('Taxon\tP10\tP90\tP90-P10\tMean RED\tMean dist to parent\tMean dist to leaves\tOriginal RED\tOriginal dist to parent\tOriginal dist to leaves\n')
    for i, taxon in enumerate(sorted(rel_dists.keys(), reverse=True)):
        labels.append(taxon + ' (%d)' % (len(rel_dists[taxon])))

        rd = rel_dists[taxon]
        for d in rd:
            x.append(d)
            y.append(i + 0.2)

        p10, p90 = np_percentile(rd, [10, 90])
        perc10.append(p10)
        perc90.append(p90)
        print(taxon, p90 - p10)

        mean_x, mean_a, mean_b = np_mean(dist_components[taxon], axis=0)
        derep_x, derep_a, derep_b = derep_dist_components[taxon]
        fout.write('%s\t%.2f\t%.2f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n'
                   % (taxon, p10, p90, p90 - p10, mean_x, mean_a, mean_b, derep_x, derep_a, derep_b))

        xDerep.append(derep_rel_dist[taxon])
        yDerep.append(i)

        xFull.append(full_rel_dist[taxon])
        yFull.append(i)
    fout.close()

    self.fig.clear()
    self.fig.set_size_inches(8, len(rel_dists) * 0.4)
    ax = self.fig.add_subplot(111)

    ax.scatter(x, y, alpha=0.5, s=24, c=(0.5, 0.5, 0.5), marker='s')
    ax.scatter(xDerep, yDerep, alpha=1.0, s=24, c=(1.0, 0.0, 0.0), marker='s')
    ax.scatter(xFull, yFull, alpha=1.0, s=24, c=(0.0, 0.0, 1.0), marker='*')

    for i in range(len(labels)):
        ax.plot((perc10[i], perc10[i]), (i, i + 0.4), 'r-')
        ax.plot((perc90[i], perc90[i]), (i, i + 0.4), 'r-')

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')
    if title:
        ax.set_title(title, size=12)

    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.05, 1.05])

    ax.set_ylabel('taxa')
    ax.set_yticks(range(0, len(rel_dists)))
    ax.set_ylim([-0.2, len(rel_dists) - 0.01])
    ax.set_yticklabels(labels)

    self.prettify(ax)

    # make plot interactive
    # mpld3.plugins.connect(fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    # mpld3.plugins.connect(fig, mpld3.plugins.MousePosition(fontsize=12))
    # mpld3.save_html(fig, output_prefix + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(output_prefix + '.png', dpi=300)
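# Illustrative sketch (not from the original source): the np_mean(..., axis=0)
# call above collapses the per-replicate-tree (RED, dist to parent, dist to
# leaves) triples into one mean triple per taxon, while np_percentile summarises
# the spread of RED values. The triples below are invented toy data.
from numpy import mean as np_mean, percentile as np_percentile

components = [(0.41, 0.10, 0.15),  # one (RED, dist to parent, dist to leaves)
              (0.45, 0.12, 0.13),  # triple per replicate tree
              (0.43, 0.11, 0.14)]
mean_red, mean_parent, mean_leaves = np_mean(components, axis=0)

reds = [c[0] for c in components]
p10, p90 = np_percentile(reds, [10, 90])
print('%.3f\t%.3f\t%.3f\t%.3f' % (mean_red, p10, p90, p90 - p10))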
def run(self, rank, input_tree_dir, full_tree_file, derep_tree_file, taxonomy_file, output_prefix, min_children, title):
    # determine named clades in full tree
    named_clades = set()
    tree = dendropy.Tree.get_from_path(full_tree_file,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)
    for node in tree.preorder_node_iter():
        if node.label:
            taxonomy = node.label.split(';')
            named_clades.add(taxonomy[-1].strip().split(':')[-1])

    print('Identified %d named clades in full tree.' % len(named_clades))

    # determine named groups with at least the specified number of children
    print('Determining taxa with sufficient named children lineages.')
    taxon_children = defaultdict(set)
    groups = defaultdict(list)
    print(taxonomy_file)
    for line in open(taxonomy_file):
        line_split = line.replace('; ', ';').split()
        genome_id = line_split[0]
        taxonomy = [x.strip() for x in line_split[1].split(';')]

        if len(taxonomy) > rank + 1:
            taxon_children[taxonomy[rank]].add(taxonomy[rank + 1])

        if len(taxonomy) > rank:
            groups[taxonomy[rank]].append(genome_id)

    groups_to_consider = set()
    for taxon, children_taxa in taxon_children.items():
        if len(children_taxa) >= min_children and taxon in named_clades:
            groups_to_consider.add(taxon)

    print('Assessing distribution over %d groups.' % len(groups_to_consider))

    # calculate relative distance for full tree
    print('')
    print('Calculating relative distance over full tree.')
    tree = dendropy.Tree.get_from_path(full_tree_file,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)
    full_rel_dist, _full_dist_components, polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups)
    if len(polyphyletic) > 0:
        print('')
        print('[Warning] Full tree contains polyphyletic groups.')

    # calculate relative distance for dereplicated tree
    print('')
    print('Calculating relative distance over dereplicated tree.')
    tree = dendropy.Tree.get_from_path(derep_tree_file,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)
    derep_rel_dist, derep_dist_components, polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups)
    groups_to_consider = groups_to_consider - polyphyletic
    print('Assessing distribution over %d groups after removing polyphyletic groups in original trees.' % len(groups_to_consider))

    # calculate relative distance to each group in each tree
    print('')
    rel_dists = defaultdict(list)
    dist_components = defaultdict(list)
    for f in os.listdir(input_tree_dir):
        if not f.endswith('.rooted.tree'):
            continue

        print(f)

        tree_file = os.path.join(input_tree_dir, f)
        tree = dendropy.Tree.get_from_path(tree_file,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # calculate relative distance to named taxa
        rel_dist, components, _polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups)

        for taxon, dist in rel_dist.items():
            rel_dists[taxon].append(dist)
            dist_components[taxon].append(components[taxon])

    # create scatter plot
    x = []
    y = []
    xDerep = []
    yDerep = []
    xFull = []
    yFull = []
    perc10 = []
    perc90 = []
    labels = []
    fout = open(output_prefix + '.tsv', 'w')
    fout.write('Taxon\tP10\tP90\tP90-P10\tMean rel. dist\tMean dist to parent\tMean dist to leaves\tOriginal rel. dist.\tOriginal dist to parent\tOriginal dist to leaves\n')
    for i, taxon in enumerate(sorted(rel_dists.keys(), reverse=True)):
        labels.append(taxon + ' (%d)' % (len(rel_dists[taxon])))

        rd = rel_dists[taxon]
        for d in rd:
            x.append(d)
            y.append(i + 0.2)

        p10, p90 = np_percentile(rd, [10, 90])
        perc10.append(p10)
        perc90.append(p90)
        print(taxon, p90 - p10)

        mean_x, mean_a, mean_b = np_mean(dist_components[taxon], axis=0)
        derep_x, derep_a, derep_b = derep_dist_components[taxon]
        fout.write('%s\t%.2f\t%.2f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n'
                   % (taxon, p10, p90, p90 - p10, mean_x, mean_a, mean_b, derep_x, derep_a, derep_b))

        xDerep.append(derep_rel_dist[taxon])
        yDerep.append(i)

        xFull.append(full_rel_dist[taxon])
        yFull.append(i)
    fout.close()

    self.fig.clear()
    self.fig.set_size_inches(8, len(rel_dists) * 0.4)
    ax = self.fig.add_subplot(111)

    ax.scatter(x, y, alpha=0.5, s=24, c=(0.5, 0.5, 0.5), marker='s')
    ax.scatter(xDerep, yDerep, alpha=1.0, s=24, c=(1.0, 0.0, 0.0), marker='s')
    ax.scatter(xFull, yFull, alpha=1.0, s=24, c=(0.0, 0.0, 1.0), marker='*')

    for i in range(len(labels)):
        ax.plot((perc10[i], perc10[i]), (i, i + 0.4), 'r-')
        ax.plot((perc90[i], perc90[i]), (i, i + 0.4), 'r-')

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')
    if title:
        ax.set_title(title, size=12)

    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.05, 1.05])

    ax.set_ylabel('taxa')
    ax.set_yticks(range(0, len(rel_dists)))
    ax.set_ylim([-0.2, len(rel_dists) - 0.01])
    ax.set_yticklabels(labels)

    self.prettify(ax)

    # make plot interactive
    # mpld3.plugins.connect(fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    # mpld3.plugins.connect(fig, mpld3.plugins.MousePosition(fontsize=12))
    # mpld3.save_html(fig, output_prefix + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(output_prefix + '.png', dpi=300)
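# Illustrative sketch (not from the original source): how the taxonomy-file
# parsing in run() builds taxon_children and groups. The two taxonomy lines
# and genome IDs are invented; rank=1 selects the phylum column of the
# semicolon-delimited lineage string.
from collections import defaultdict

taxonomy_lines = [
    'G1\td__Bacteria; p__Proteobacteria; c__Gammaproteobacteria',
    'G2\td__Bacteria; p__Proteobacteria; c__Alphaproteobacteria',
]
rank = 1
taxon_children = defaultdict(set)
groups = defaultdict(list)
for line in taxonomy_lines:
    line_split = line.replace('; ', ';').split()
    genome_id = line_split[0]
    taxonomy = [x.strip() for x in line_split[1].split(';')]
    if len(taxonomy) > rank + 1:
        taxon_children[taxonomy[rank]].add(taxonomy[rank + 1])
    if len(taxonomy) > rank:
        groups[taxonomy[rank]].append(genome_id)
# taxon_children['p__Proteobacteria'] -> {'c__Gammaproteobacteria', 'c__Alphaproteobacteria'}
# groups['p__Proteobacteria'] -> ['G1', 'G2']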
def _percent_correct_plot(self, rel_dists, taxa_for_dist_inference, output_prefix):
    """Create plots showing correctly classified taxa for different relative distance values.

    Parameters
    ----------
    rel_dists : d[rank_index][taxon] -> relative divergence
        Relative divergence of taxa at each rank.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring relative divergence thresholds.
    output_prefix : str
        Prefix for plots.
    """

    print('')
    print(' Relative divergence thresholds (rank, threshold, parent taxa, child taxa):')

    ranks = sorted(rel_dists.keys())
    rel_dist_thresholds = []
    for i in range(ranks[0], ranks[-1]):
        parent_rank = i
        child_rank = i + 1

        # determine classification results for relative divergence
        # values between the medians of adjacent taxonomic ranks
        parent_rds = []
        for taxa, rd in rel_dists[parent_rank].items():
            if taxa in taxa_for_dist_inference:
                parent_rds.append(rd)
        parent_p50 = np_percentile(parent_rds, 50)

        child_rds = []
        for taxa, rd in rel_dists[child_rank].items():
            if taxa in taxa_for_dist_inference:
                child_rds.append(rd)
        child_p50 = np_percentile(child_rds, 50)

        r = []
        y_parent = []
        y_child = []
        y_mean_corr = []
        for test_r in np_linspace(parent_p50, child_p50, 100):
            parent_cor = float(sum([1 for rd in parent_rds if rd <= test_r])) / len(parent_rds)
            child_cor = float(sum([1 for rd in child_rds if rd > test_r])) / len(child_rds)

            r.append(test_r)
            y_parent.append(parent_cor)
            y_child.append(child_cor)
            y_mean_corr.append(0.5 * parent_cor + 0.5 * child_cor)

        # create plot of correctly classified taxa
        self.fig.clear()
        self.fig.set_size_inches(6, 6)
        ax = self.fig.add_subplot(111)
        ax.plot(r, y_parent, 'k--', label=Taxonomy.rank_labels[i])
        ax.plot(r, y_child, 'k:', label=Taxonomy.rank_labels[i + 1])
        ax.plot(r, y_mean_corr, 'r-', label='mean')
        legend = ax.legend(loc='upper left')
        legend.draw_frame(False)

        # find maximum of mean correct classification
        max_mean = max(y_mean_corr)
        r_max_values = [r[j] for j, rd in enumerate(y_mean_corr) if rd == max_mean]
        r_max_value = np_mean(r_max_values)  # Note: this will fail if there are multiple local maxima
        print(' %s\t%.3f\t%d\t%d' % (Taxonomy.rank_labels[parent_rank], r_max_value, len(parent_rds), len(child_rds)))

        # check that there is a single local maximum
        rd_indices = [j for j, rd in enumerate(y_mean_corr) if rd == max_mean]
        for rd_index in range(0, len(rd_indices) - 1):
            if rd_indices[rd_index] != rd_indices[rd_index + 1] - 1:
                print('[Warning] There are multiple local maxima, so the estimated relative divergence threshold will be invalid.')

        rel_dist_thresholds.append(r_max_value)

        y_min, _y_max = ax.get_ylim()
        ax.axvline(x=r_max_value, ymin=0, ymax=1, color='r', ls='--')
        ax.text(r_max_value + 0.001, y_min + 0.01, '%.3f' % r_max_value, horizontalalignment='left')

        ax.set_xlabel('relative distance')
        ax.set_ylabel('% taxa correctly classified')

        self.prettify(ax)

        self.fig.tight_layout(pad=1)
        self.fig.savefig(output_prefix + '.%s_%s.png' % (Taxonomy.rank_labels[parent_rank], Taxonomy.rank_labels[child_rank]), dpi=96)

    print('')

    return rel_dist_thresholds
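# Illustrative sketch (not from the original source): the threshold search
# performed above, reduced to its core. Given relative divergence values for a
# parent rank and its child rank, scan candidate thresholds between the two
# medians and keep the one maximising the mean fraction correctly classified.
# The helper name best_threshold and the toy inputs are invented.
from numpy import linspace as np_linspace, percentile as np_percentile

def best_threshold(parent_rds, child_rds):
    parent_p50 = np_percentile(parent_rds, 50)
    child_p50 = np_percentile(child_rds, 50)
    best_r, best_corr = None, -1.0
    for test_r in np_linspace(parent_p50, child_p50, 100):
        # parent taxa are "correct" at or below the threshold,
        # child taxa are "correct" above it
        parent_cor = sum(1 for rd in parent_rds if rd <= test_r) / float(len(parent_rds))
        child_cor = sum(1 for rd in child_rds if rd > test_r) / float(len(child_rds))
        mean_corr = 0.5 * parent_cor + 0.5 * child_cor
        if mean_corr > best_corr:
            best_r, best_corr = test_r, mean_corr
    return best_r, best_corr

r, corr = best_threshold([0.30, 0.35, 0.40, 0.42], [0.55, 0.60, 0.62, 0.70])
# any r between 0.42 and 0.55 separates these toy ranks perfectly (corr == 1.0)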
def _distribution_plot(self, rel_dists, taxa_for_dist_inference, distribution_table, plot_file):
    """Create plot showing the distribution of taxa at each taxonomic rank.

    Parameters
    ----------
    rel_dists: d[rank_index][taxon] -> relative divergence
        Relative divergence of taxa at each rank.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    distribution_table : str
        Desired name of output table with distribution information.
    plot_file : str
        Desired name of output plot.
    """

    self.fig.clear()
    self.fig.set_size_inches(12, 6)
    ax = self.fig.add_subplot(111)

    # create normal distributions
    for i, rank in enumerate(sorted(rel_dists.keys())):
        v = [dist for taxa, dist in rel_dists[rank].items() if taxa in taxa_for_dist_inference]
        if len(v) < 2:
            continue

        u = np_mean(v)
        rv = norm(loc=u, scale=np_std(v))
        x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
        nd = rv.pdf(x)
        # ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2)
        # ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2)

    # create percentile and classification boundary lines
    percentiles = {}
    for i, rank in enumerate(sorted(rel_dists.keys())):
        v = [dist for taxa, dist in rel_dists[rank].items() if taxa in taxa_for_dist_inference]
        if len(v) == 0:
            continue

        p10, p50, p90 = np_percentile(v, [10, 50, 90])
        ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

        for b in [-0.2, -0.1, 0.1, 0.2]:
            boundary = p50 + b
            if 0.0 < boundary < 1.0:
                if abs(b) == 0.1:
                    c = (1.0, 0.65, 0.0)  # orange
                else:
                    c = (1.0, 0.0, 0.0)  # red
                ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

        percentiles[i] = [p10, p50, p90]

    # create scatter plot and results table
    fout = open(distribution_table, 'w')
    fout.write('Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n')
    x = []
    y = []
    c = []
    labels = []
    rank_labels = []
    for i, rank in enumerate(sorted(rel_dists.keys())):
        rank_label = Taxonomy.rank_labels[rank]
        rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))

        mono = []
        poly = []
        no_inference = []
        for clade_label, dist in rel_dists[rank].items():
            x.append(dist)
            y.append(i)
            labels.append(clade_label)

            if is_integer(clade_label.split('^')[-1]):
                # taxa with a numerical suffix after a caret indicate
                # polyphyletic groups when decorated with tax2tree
                c.append((1.0, 0.0, 0.0))
                poly.append(dist)
            elif clade_label not in taxa_for_dist_inference:
                c.append((0.3, 0.3, 0.3))
                no_inference.append(dist)
            else:
                c.append((0.0, 0.0, 1.0))
                mono.append(dist)

            # report results
            v = [clade_label, dist]
            if i in percentiles:
                p10, p50, p90 = percentiles[i]
                percentile_outlier = not (p10 <= dist <= p90)
                v += percentiles[i] + [str(percentile_outlier)]
            else:
                percentile_outlier = 'Insufficient data to calculate percentiles'
                v += [-1, -1, -1] + [str(percentile_outlier)]

            fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v))

        # histogram for each rank
        mono = np_array(mono)
        no_inference = np_array(no_inference)
        poly = np_array(poly)
        binwidth = 0.025
        bins = np_arange(0, 1.0 + binwidth, binwidth)

        d = len(mono) + len(poly) + len(no_inference)
        if d == 0:
            # guard against ranks with no taxa (avoids division by zero)
            break

        w = float(len(mono)) / d
        n = 0
        if len(mono) > 0:
            mono_max_count = max(np_histogram(mono, bins=bins)[0])
            mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)
            n, b, p = ax.hist(mono, bins=bins,
                              color=(0.0, 0.0, 1.0),
                              alpha=0.25,
                              weights=0.9 * w * mono_weights,
                              bottom=i,
                              lw=0,
                              zorder=0)

        if len(no_inference) > 0:
            no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
            no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)
            ax.hist(no_inference, bins=bins,
                    color=(0.3, 0.3, 0.3),
                    alpha=0.25,
                    weights=0.9 * (1.0 - w) * no_inference_weights,
                    bottom=i + n,
                    lw=0,
                    zorder=0)

        if len(poly) > 0:
            poly_max_count = max(np_histogram(poly, bins=bins)[0])
            poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)
            ax.hist(poly, bins=bins,
                    color=(1.0, 0.0, 0.0),
                    alpha=0.25,
                    weights=0.9 * (1.0 - w) * poly_weights,
                    bottom=i + n,
                    lw=0,
                    zorder=0)

    fout.close()

    # overlay scatter plot elements
    scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')
    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.05, 1.05])
    ax.set_ylabel('rank (no. taxa)')
    ax.set_yticks(range(0, len(rel_dists)))
    ax.set_ylim([-0.2, len(rel_dists) - 0.01])
    ax.set_yticklabels(rank_labels)
    self.prettify(ax)

    # make plot interactive
    mpld3.plugins.clear(self.fig)
    mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
    mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(plot_file, dpi=self.dpi)
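# Illustrative sketch (not from the original source): the histogram trick used
# by both _distribution_plot variants. Each per-rank histogram is normalised by
# its tallest bin so no bar exceeds a height of 0.9, then drawn with bottom=i
# so the ranks stack on integer baselines of a shared axis. This uses plain
# pyplot rather than the class's self.fig, and the toy data are invented.
import matplotlib.pyplot as plt
from numpy import arange as np_arange, array as np_array
from numpy import histogram as np_histogram, ones_like as np_ones_like

fig, ax = plt.subplots()
bins = np_arange(0, 1.0 + 0.025, 0.025)
for i, dists in enumerate([[0.2, 0.22, 0.25, 0.3], [0.5, 0.52, 0.55]]):
    dists = np_array(dists, dtype=float)
    max_count = max(np_histogram(dists, bins=bins)[0])
    weights = np_ones_like(dists) * (1.0 / max_count)  # tallest bin -> height 1
    ax.hist(dists, bins=bins, weights=0.9 * weights, bottom=i, lw=0, alpha=0.25)
fig.savefig('stacked_rank_histograms.png')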