def rank_median_rd(self, phylum_rel_dists, taxa_for_dist_inference):
    """Calculate median relative divergence for each rank.

    Parameters
    ----------
    phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
        Relative divergence of taxon at each rank for different phylum-level rootings.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    """

    medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

    median_for_rank = {}
    for i, rank in enumerate(sorted(medians_for_taxa.keys())):
        v = [np_median(dists)
             for taxon, dists in medians_for_taxa[rank].items()
             if taxon in taxa_for_dist_inference]

        if v:
            median_for_rank[rank] = np_median(v)

    return median_for_rank
def compute(self):
    """Detect RFI"""

    median_size = (self.median_size_time, self.median_size_freq)
    data = self.data - sp_dip.median_filter(self.data, size=median_size)
    # data1 = np.abs(np.sum(data, 0))
    # data1 = np.abs(np.median(data, 0))
    data1 = np_median(data, 0)
    # th = np.percentile(data1, th_prctile)
    thl = []
    for ii in range(data1.shape[0] - 9):
        thl.append(prctile(data1[ii:ii + 10], p=90))
        # thl.append(max(data1[ii:ii+10]))
    th = self.th_k * np_median(thl)
    for ii in range(data1.shape[0]):
        if data1[ii] > th:
            z, p_value = sp_normaltest(data[:, ii])
            if p_value < self.p_th:
                if self.is_out_selected('Not_normal'):
                    self.flag_results['Not_normal'].flag_data[:, ii] = 1
            else:
                if self.is_out_selected('Normal'):
                    self.flag_results['Normal'].flag_data[:, ii] = 1
    return self.flag_results
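# A plausible set of import aliases assumed by the RFI detector above; the
# original import block is not shown, so treat this as a sketch. `sp_dip`
# appears to alias scipy.ndimage, `sp_normaltest` scipy.stats.normaltest,
# and `prctile` a percentile helper (np.percentile covers the p=90 use).
from numpy import median as np_median
from numpy import percentile as np_percentile
import scipy.ndimage as sp_dip
from scipy.stats import normaltest as sp_normaltest

def prctile(x, p=90):
    # hypothetical shim for the legacy matplotlib.mlab.prctile helper
    return np_percentile(x, p)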
def makeBinDist(self, transformedCP, averageCoverages, kmerNormPC1, kmerPCs,
                contigGCs, contigLengths):
    """Determine the distribution of the points in this bin

    The distribution is largely normal, except at the boundaries.
    """
    # print "MBD", self.id, self.binSize
    self.binSize = self.rowIndices.shape[0]
    if 0 == np.size(self.rowIndices):
        return

    # get the centroids
    (self.covMedians, self.covStdevs) = self.getCentroidStats(transformedCP)
    (self.lengthMean, self.lengthStd) = self.getCentroidStats(contigLengths)

    self.kValMeanNormPC1 = np_median(kmerPCs[self.rowIndices])
    self.kValStdevNormPC1 = np_std(kmerPCs[self.rowIndices])

    self.kMedian = np_median(kmerPCs[self.rowIndices], axis=0)
    self.kStdevs = np_std(kmerPCs[self.rowIndices], axis=0)

    cvals = self.getAverageCoverageDist(averageCoverages)
    self.cValMedian = np_around(np_median(cvals), decimals=3)
    self.cValStdev = np_around(np_std(cvals), decimals=3)

    self.gcMedian = np_median(contigGCs[self.rowIndices])
    self.gcStdev = np_std(contigGCs[self.rowIndices])

    # work out the total size
    self.totalBP = sum([contigLengths[i] for i in self.rowIndices])

    # set the acceptance ranges
    self.makeLimits()
def mad(arr):
    """Median Absolute Deviation: a "robust" version of standard deviation.

    Indicates variability of the sample.
    https://en.wikipedia.org/wiki/Median_absolute_deviation
    """
    arr = np.ma.array(arr).compressed()  # should be faster to not use masked arrays
    med = np_median(arr)
    return np_median(np_abs(arr - med))
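# Minimal usage sketch for mad(); assumes numpy is aliased as in the
# function above (np, np_median, np_abs). The data values are made up.
import numpy as np
from numpy import median as np_median, abs as np_abs

x = np.array([1.0, 2.0, 2.5, 3.0, 100.0])
print(mad(x))  # 0.5: the outlier barely moves the MAD, unlike np.std(x)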
def create_feat_mat_1(graph):
    CCs = list(nx_clustering(graph).values())
    DCs = list(nx_average_neighbor_degree(graph).values())
    degrees = [tup[1] for tup in graph.degree()]
    edge_wts = [tup[2] for tup in graph.edges.data('weight')]

    A_mat = nx_to_numpy_matrix(graph)
    svs = np_linalg_svd(A_mat, full_matrices=False, compute_uv=False)

    if len(svs) >= 3:
        sv1 = svs[0]
        sv2 = svs[1]
        sv3 = svs[2]
    elif len(svs) >= 2:
        sv1 = svs[0]
        sv2 = svs[1]
        sv3 = 0
    else:
        sv1 = svs[0]
        sv2 = sv3 = 0

    feat_mat = np_vstack((nx_density(graph), nx_number_of_nodes(graph),
                          max(degrees), np_mean(degrees), np_median(degrees),
                          np_var(degrees), max(CCs), np_mean(CCs), np_var(CCs),
                          np_mean(edge_wts), max(edge_wts), np_var(edge_wts),
                          np_mean(DCs), np_var(DCs), max(DCs),
                          sv1, sv2, sv3)).T

    return feat_mat
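# The aliased imports create_feat_mat_1 appears to assume; a sketch, since
# the original import block is not shown. Note that networkx.to_numpy_matrix
# was removed in networkx 3.0 in favour of to_numpy_array, so the alias may
# need to point there on newer releases.
from networkx import (clustering as nx_clustering,
                      average_neighbor_degree as nx_average_neighbor_degree,
                      to_numpy_matrix as nx_to_numpy_matrix,
                      density as nx_density,
                      number_of_nodes as nx_number_of_nodes)
from numpy.linalg import svd as np_linalg_svd
from numpy import (vstack as np_vstack, mean as np_mean,
                   median as np_median, var as np_var)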
def _write_rd_tree(self, tree, rel_node_dists, output_tree):
    """Write out tree with RED specified at each internal node."""

    # copy tree so node labels aren't changed in original tree
    red_tree = copy.deepcopy(tree)

    for node_id, n in enumerate(red_tree.preorder_node_iter()):
        if n == red_tree.seed_node:
            red = 0
        else:
            red = np_median(rel_node_dists[node_id])

        red_str = "|RED={:.3f}".format(red)
        if n.is_leaf():
            n.taxon.label += red_str
        else:
            if n.label:
                n.label += red_str
            else:
                n.label = red_str

    red_tree.write_to_path(output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)
def diff_on_off(on_off, i_start=0, i_stop=-1):
    if i_start < 0:
        i_start = len(on_off) + i_start
    if i_stop < 0:
        i_stop = len(on_off) + i_stop

    # dict views are not indexable in Python 3, so materialize the keys once
    keys = list(on_off.keys())

    n_samp = []
    for ii in range(i_start, i_stop):
        n_samp.append(on_off[keys[ii]].data[DATA_KEYS[0]].shape[0])
    n_samp = min(n_samp)
    print(n_samp)

    data_diff = {}
    for dkey in DATA_KEYS:
        data_diff[dkey] = []
    for ii in range(i_start, i_stop, 2):
        for dkey in DATA_KEYS:
            print(str(ii) + ": " + on_off[keys[ii]].state + " " +
                  on_off[keys[ii + 1]].state)
            data_diff[dkey].append(
                on_off[keys[ii]].data[dkey][:n_samp, :] -
                on_off[keys[ii + 1]].data[dkey][:n_samp, :])
    for dkey in DATA_KEYS:
        data_diff[dkey] = np_median(np_array(data_diff[dkey]), 0)
    return data_diff
def aggregateResources(self, nbins=20):
    """Return a JSON object containing the max, min, mean, median, and the
    histogram itself for all memory/CPU measurements.

    WARNING: this method is not particularly efficient and shouldn't be
    used lightly!
    """
    allData = {"memory": {"data": []}, "cpu": {"data": []}}
    query = JobInstance.objects.filter(job=self).only("cpu").only("memory")
    if query.count():
        for inst in query:
            agg = inst.aggregateResources()
            for key in ['cpu', 'memory']:
                if len(agg[key]):
                    allData[key]['data'].append(max(agg[key]))
    del query

    # finished aggregation, now we can do calculations
    for key in allData:
        d = allData[key]["data"]
        allData[key]["max"] = max(d)
        allData[key]["min"] = min(d)
        arr = np_array(d, dtype=float)
        allData[key]["mean"] = float(np_mean(arr, axis=0))
        allData[key]["median"] = float(np_median(arr, axis=0))
        hist, bins = np_hist(arr, nbins)
        center = (bins[:-1] + bins[1:]) / 2
        w = bins[1] - bins[0]
        histo = np_array([center, hist])
        allData[key]['histogram'] = {
            "histo": histo.tolist(),
            "histoT": histo.T.tolist(),
            "binWidth": float(w)
        }
        del allData[key]['data']
    return dumps(allData)
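# Aliases this method appears to rely on; a sketch, as the original imports
# are not shown. `np_hist` matches numpy.histogram's (counts, bin_edges)
# return value used above, and `dumps` the json serializer.
from json import dumps
from numpy import (array as np_array, mean as np_mean,
                   median as np_median, histogram as np_hist)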
def noise_dwt(cls, coeff, w):
    """Return the estimation of the DWT components noise level

    coeff: DWT coefficients
    w: pywt wavelet object
    """
    n_boot = 1000
    k_th = 10
    k_std = 1. / np_sqrt(2)
    std_l = []
    std_a = np_zeros(n_boot)
    wcomp = cls.wavecomp(coeff, w, len(coeff) - 1)
    for ii in range(n_boot):
        std_a[ii] = np_std(bootstrap_resample(wcomp, 10))
    stdv = np_median(std_a)
    std_l.append(stdv)
    for ll in range(len(coeff) - 2, 0, -1):
        stdv = stdv * k_std
        std_l.append(stdv)
    std_l.append(0)
    std_l.reverse()
    return np_array(std_l) * k_th
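# bootstrap_resample is called above but not defined in this listing. A
# minimal sketch of what it plausibly does (draw n points with replacement
# from an array); the name and signature are assumptions.
from numpy import random as np_random

def bootstrap_resample(x, n):
    # draw n indices uniformly with replacement and return the resample
    idx = np_random.randint(0, len(x), size=n)
    return x[idx]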
def test_median():
    for dtype in NUMERIC_TYPES:
        for shape in ((10,), (10, 11), (10, 11, 12)):
            X = (100 * (np.random.random(shape) - .5)).astype(dtype)
            for a in range(X.ndim):
                assert_array_equal(_median(X, axis=a).squeeze(),
                                   np_median(X.astype(np.float64), axis=a))
def _median_rank_rd(self,
                    tree,
                    placed_taxon,
                    taxonomy,
                    trusted_taxa_file,
                    min_children,
                    min_support):
    """Calculate median relative divergence to each node and thresholds for each taxonomic rank.

    Parameters
    ----------
    tree : Tree
        Dendropy Tree.
    placed_taxon : set
        Taxon currently placed in tree which can be used for relative divergence inference.
    taxonomy: d[taxon_id] -> taxonomy info
        Taxonomic information for extant taxa.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
    min_children : int
        Only consider taxa with at least the specified number of children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when inferring distribution.

    Returns
    -------
    d[rank_index] -> float
        Median relative divergence for each taxonomic rank.
    """

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                             taxonomy,
                                                             trusted_taxa,
                                                             min_children,
                                                             min_support)
    taxa_for_dist_inference.intersection_update(placed_taxon)

    # infer distribution
    outliers = Outliers()
    phylum_rel_dists, rel_node_dists = outliers.median_rd_over_phyla(tree,
                                                                     taxa_for_dist_inference,
                                                                     taxonomy)
    median_for_rank = outliers.rank_median_rd(phylum_rel_dists,
                                              taxa_for_dist_inference)

    # set edge lengths to median value over all rootings
    tree.seed_node.rel_dist = 0.0
    for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
        n.rel_dist = np_median(rel_node_dists[n.id])

    return median_for_rank
def rank_median_rd(self, phylum_rel_dists, taxa_for_dist_inference):
    """Calculate median relative divergence for each rank.

    Parameters
    ----------
    phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
        Relative divergence of taxon at each rank for different phylum-level rootings.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    """

    medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

    median_for_rank = {}
    for i, rank in enumerate(sorted(medians_for_taxa.keys())):
        v = [np_median(dists)
             for taxon, dists in medians_for_taxa[rank].items()
             if taxon in taxa_for_dist_inference]

        if v:  # skip ranks with no inference taxa; np_median([]) would yield NaN
            median_for_rank[rank] = np_median(v)

    return median_for_rank
def getCentroidStats(self, profile):
    """Calculate the centroids of the profile"""
    working_list = profile[self.rowIndices]

    # return the mean and stdev
    # we divide by std so we need to make sure it's never 0
    tmp_stds = np_std(working_list, axis=0)
    mean_std = np_mean(tmp_stds)
    try:
        std = np_array([x if x != 0 else mean_std for x in tmp_stds])
    except Exception:
        # tmp_stds is a scalar for 1-D profiles, so it is not iterable
        std = mean_std

    return (np_median(working_list, axis=0), std)
def rep_genome_stats(self, clusters, genome_files):
    """Calculate statistics relative to representative genome."""

    self.logger.info('Calculating statistics to cluster representatives:')
    stats = {}
    for idx, (rid, cids) in enumerate(clusters.items()):
        if len(cids) == 0:
            stats[rid] = self.RepStats(min_ani=-1,
                                       mean_ani=-1,
                                       std_ani=-1,
                                       median_ani=-1)
        else:
            # calculate ANI to representative genome
            gid_pairs = []
            for cid in cids:
                gid_pairs.append((cid, rid))
                gid_pairs.append((rid, cid))

            if True:  # *** DEBUGGING
                ani_af = self.fastani.pairs(gid_pairs,
                                            genome_files,
                                            report_progress=False)
            else:
                ani_af = self.fastani.ani_cache

            # calculate statistics
            anis = [FastANI.symmetric_ani(ani_af, cid, rid)[0]
                    for cid in cids]

            stats[rid] = self.RepStats(min_ani=min(anis),
                                       mean_ani=np_mean(anis),
                                       std_ani=np_std(anis),
                                       median_ani=np_median(anis))

        statusStr = '-> Processing %d of %d (%.2f%%) clusters.'.ljust(86) % (
            idx + 1, len(clusters), float((idx + 1) * 100) / len(clusters))
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()
    sys.stdout.write('\n')

    return stats
def compute(self):
    """Detect RFI"""

    m_ind = {'L': 0, 'R': 1, 'Q': 2, 'U': 3}
    if (self.num_of_rms_above_median[0] <= self.num_of_rms_above_median[1]
            or self.num_of_rms_above_median[0] >= self.num_of_rms_above_median[2]):
        # threshold must lie strictly within the configured bounds
        raise ValueError
    for lab in self.flag_results.keys():
        med = np_median(self.data[m_ind[lab]][self.data[m_ind[lab]] > 10])
        rms = np_sqrt(((self.data[m_ind[lab]][self.data[m_ind[lab]] > 10] -
                        med) ** 2).mean())
        res = sp_threshold(self.data[m_ind[lab]] - med,
                           threshmin=self.num_of_rms_above_median[0] * rms,
                           newval=0)
        res[res > 0] = 1
        self.flag_results[lab].pola = lab
        self.flag_results[lab].flag_data = res
    return self.flag_results
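# scipy.stats.threshold (presumably aliased as sp_threshold above) was
# removed from SciPy in its 0.17/1.0 cleanup. A minimal drop-in sketch of
# the old behaviour, assuming only threshmin/newval are used as above:
import numpy as np

def sp_threshold(a, threshmin=None, threshmax=None, newval=0):
    # replace values outside [threshmin, threshmax] with newval
    a = np.asarray(a).copy()
    mask = np.zeros(a.shape, dtype=bool)
    if threshmin is not None:
        mask |= a < threshmin
    if threshmax is not None:
        mask |= a > threshmax
    a[mask] = newval
    return a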
def _rep_genome_stats(self, clusters, genome_files):
    """Calculate statistics relative to representative genome."""

    self.logger.info('Calculating statistics to cluster representatives:')
    stats = {}
    for idx, (rid, cids) in enumerate(clusters.items()):
        if len(cids) == 0:
            stats[rid] = self.RepStats(min_ani=-1,
                                       mean_ani=-1,
                                       std_ani=-1,
                                       median_ani=-1)
        else:
            # calculate ANI to representative genome
            gid_pairs = []
            for cid in cids:
                gid_pairs.append((cid, rid))
            ani_af = self.ani_cache.fastani_pairs(gid_pairs,
                                                  genome_files,
                                                  report_progress=False)

            # calculate statistics
            anis = [ani_af[cid][rid][0] for cid in cids]
            stats[rid] = self.RepStats(min_ani=min(anis),
                                       mean_ani=np_mean(anis),
                                       std_ani=np_std(anis),
                                       median_ani=np_median(anis))

        statusStr = '-> Processing %d of %d (%.2f%%) clusters.'.ljust(86) % (
            idx + 1, len(clusters), float((idx + 1) * 100) / len(clusters))
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()
    sys.stdout.write('\n')

    return stats
def _median_summary_outlier_file(self, phylum_rel_dists,
                                 taxa_for_dist_inference,
                                 gtdb_parent_ranks, outlier_table, rank_file,
                                 verbose_table):
    """Identify outliers relative to the median of rank distributions.

    Parameters
    ----------
    phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
        Relative divergence of taxon at each rank for different phylum-level rootings.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    gtdb_parent_ranks: d[taxon] -> string indicating parent taxa
        Parent taxa for each taxon.
    outlier_table : str
        Desired name of output table.
    rank_file : str
        Desired name of file indicating median relative distance of each rank.
    verbose_table : boolean
        Print additional columns in output table.
    """

    # determine median relative distance for each taxa
    medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

    # determine median relative distance for each rank
    median_for_rank = self.rank_median_rd(phylum_rel_dists,
                                          taxa_for_dist_inference)

    with open(rank_file, 'w') as fout_rank:
        median_str = []
        for rank in sorted(median_for_rank.keys()):
            median_str.append('"' + Taxonomy.rank_labels[rank] + '":' +
                              str(median_for_rank[rank]))
        fout_rank.write('{' + ','.join(median_str) + '}\n')

    fout = open(outlier_table, 'w')
    if verbose_table:
        fout.write('Taxa\tGTDB taxonomy\tMedian distance')
        fout.write('\tMedian of rank\tMedian difference')
        fout.write('\tClosest rank\tClassification\n')
    else:
        fout.write('Taxa\tGTDB taxonomy\tMedian distance\tMedian difference\tClosest rank\tClassification\n')

    for rank in sorted(median_for_rank.keys()):
        for clade_label, dists in medians_for_taxa[rank].items():
            dists = np_array(dists)

            taxon_median = np_median(dists)
            delta = taxon_median - median_for_rank[rank]

            closest_rank_dist = 1e10
            for test_rank, test_median in median_for_rank.items():
                abs_dist = abs(taxon_median - test_median)
                if abs_dist < closest_rank_dist:
                    closest_rank_dist = abs_dist
                    closest_rank = Taxonomy.rank_labels[test_rank]

            classification = "OK"
            if delta < -0.2:
                classification = "very overclassified"
            elif delta < -0.1:
                classification = "overclassified"
            elif delta > 0.2:
                classification = "very underclassified"
            elif delta > 0.1:
                classification = "underclassified"

            if verbose_table:
                fout.write('%s\t%s\t%.2f\t%.3f\t%.3f\t%s\t%s\n' %
                           (clade_label,
                            ';'.join(gtdb_parent_ranks[clade_label]),
                            taxon_median,
                            median_for_rank[rank],
                            delta,
                            closest_rank,
                            classification))
            else:
                fout.write('%s\t%s\t%.3f\t%.3f\t%s\t%s\n' %
                           (clade_label,
                            ';'.join(gtdb_parent_ranks[clade_label]),
                            taxon_median,
                            delta,
                            closest_rank,
                            classification))
    fout.close()
def _distribution_summary_plot(self, phylum_rel_dists,
                               taxa_for_dist_inference, plot_file):
    """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

    Parameters
    ----------
    phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
        Relative divergence of taxon at each rank for different phylum-level rootings.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    plot_file : str
        Desired name of output plot.
    """

    self.fig.clear()
    self.fig.set_size_inches(12, 6)
    ax = self.fig.add_subplot(111)

    # determine median relative distance for each taxa
    medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

    # create percentile and classification boundary lines
    percentiles = {}
    for i, rank in enumerate(sorted(medians_for_taxa.keys())):
        v = [np_median(dists)
             for taxon, dists in medians_for_taxa[rank].items()
             if taxon in taxa_for_dist_inference]

        if not v:
            # no taxa at rank suitable for creating classification
            # boundaries
            continue

        p10, p50, p90 = np_percentile(v, [10, 50, 90])
        ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

        for b in [-0.2, -0.1, 0.1, 0.2]:
            boundary = p50 + b
            if 1.0 > boundary > 0.0:
                if abs(b) == 0.1:
                    c = (1.0, 0.65, 0.0)  # orange
                else:
                    c = (1.0, 0.0, 0.0)
                ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

        percentiles[i] = [p10, p50, p90]

    # create scatter plot and results table
    x = []
    y = []
    c = []
    labels = []
    rank_labels = []
    for i, rank in enumerate(sorted(medians_for_taxa.keys())):
        rank_label = Taxonomy.rank_labels[rank]
        rank_labels.append(rank_label + ' (%d)' % len(medians_for_taxa[rank]))

        mono = []
        poly = []
        no_inference = []
        for clade_label, dists in medians_for_taxa[rank].items():
            md = np_median(dists)
            x.append(md)
            y.append(i)
            labels.append(clade_label)

            if self._is_integer(clade_label.split('^')[-1]):
                # taxa with a numerical suffix after a caret indicate
                # polyphyletic groups when decorated with tax2tree
                c.append((1.0, 0.0, 0.0))
                poly.append(md)
            elif clade_label not in taxa_for_dist_inference:
                c.append((0.3, 0.3, 0.3))
                no_inference.append(md)
            else:
                c.append((0.0, 0.0, 1.0))
                mono.append(md)

        # histogram for each rank
        n = 0
        if len(mono) > 0:
            mono = np_array(mono)
            no_inference = np_array(no_inference)
            poly = np_array(poly)
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)

            mono_max_count = max(np_histogram(mono, bins=bins)[0])
            mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

            w = float(len(mono)) / (len(mono) + len(poly) + len(no_inference))
            n, b, p = ax.hist(mono, bins=bins,
                              color=(0.0, 0.0, 1.0),
                              alpha=0.25,
                              weights=0.9 * w * mono_weights,
                              bottom=i,
                              lw=0,
                              zorder=0)

        if len(no_inference) > 0:
            no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
            no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)

            ax.hist(no_inference, bins=bins,
                    color=(0.3, 0.3, 0.3),
                    alpha=0.25,
                    weights=0.9 * (1.0 - w) * no_inference_weights,
                    bottom=i + n,
                    lw=0,
                    zorder=0)

        if len(poly) > 0:
            poly_max_count = max(np_histogram(poly, bins=bins)[0])
            poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

            ax.hist(poly, bins=bins,
                    color=(1.0, 0.0, 0.0),
                    alpha=0.25,
                    weights=0.9 * (1.0 - w) * poly_weights,
                    bottom=i + n,
                    lw=0,
                    zorder=0)

    scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.01, 1.01])

    ax.set_ylabel('rank (no. taxa)')
    ax.set_yticks(list(range(0, len(medians_for_taxa))))
    ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
    ax.set_yticklabels(rank_labels)

    self.prettify(ax)

    # make plot interactive
    mpld3.plugins.clear(self.fig)
    mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
    mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(plot_file, dpi=self.dpi)
def pairwise_stats(self, clusters, genome_files):
    """Calculate statistics for all pairwise comparisons in a species cluster."""

    self.logger.info(
        f'Restricting pairwise comparisons to {self.max_genomes_for_stats:,} randomly selected genomes.')
    self.logger.info(
        'Calculating statistics for all pairwise comparisons in a species cluster:')

    stats = {}
    for idx, (rid, cids) in enumerate(clusters.items()):
        statusStr = '-> Processing {:,} of {:,} ({:.2f}%) clusters (size = {:,}).'.ljust(86).format(
            idx + 1, len(clusters), float((idx + 1) * 100) / len(clusters), len(cids))
        sys.stdout.write('{}\r'.format(statusStr))
        sys.stdout.flush()

        if len(cids) == 0:
            stats[rid] = self.PairwiseStats(min_ani=-1,
                                            mean_ani=-1,
                                            std_ani=-1,
                                            median_ani=-1,
                                            ani_to_medoid=-1,
                                            mean_ani_to_medoid=-1,
                                            mean_ani_to_rep=-1,
                                            ani_below_95=-1)
        else:
            if len(cids) > self.max_genomes_for_stats:
                cids = set(random.sample(cids, self.max_genomes_for_stats))

            # calculate ANI between all pairs of genomes in the cluster
            gid_pairs = []
            gids = list(cids.union([rid]))
            for gid1, gid2 in combinations(gids, 2):
                gid_pairs.append((gid1, gid2))
                gid_pairs.append((gid2, gid1))

            if True:  # ***DEBUGGING
                ani_af = self.fastani.pairs(gid_pairs,
                                            genome_files,
                                            report_progress=False)
            else:
                ani_af = self.fastani.ani_cache

            # calculate medoid point
            if len(gids) > 2:
                dist_mat = np_zeros((len(gids), len(gids)))
                for i, gid1 in enumerate(gids):
                    for j, gid2 in enumerate(gids):
                        if i < j:
                            ani, _af = FastANI.symmetric_ani(ani_af, gid1, gid2)
                            dist_mat[i, j] = 100 - ani
                            dist_mat[j, i] = 100 - ani

                medoid_idx = np_argmin(dist_mat.sum(axis=0))
                medoid_gid = gids[medoid_idx]
            else:
                # with only 2 genomes in a cluster, the representative is the
                # natural medoid at least for reporting statistics for the
                # individual species cluster
                medoid_gid = rid

            mean_ani_to_medoid = np_mean([FastANI.symmetric_ani(ani_af, gid, medoid_gid)[0]
                                          for gid in gids if gid != medoid_gid])

            mean_ani_to_rep = np_mean([FastANI.symmetric_ani(ani_af, gid, rid)[0]
                                       for gid in gids if gid != rid])

            if mean_ani_to_medoid < mean_ani_to_rep:
                self.logger.error('mean_ani_to_medoid < mean_ani_to_rep')
                sys.exit(-1)

            # calculate statistics
            anis = []
            for gid1, gid2 in combinations(gids, 2):
                ani, _af = FastANI.symmetric_ani(ani_af, gid1, gid2)
                anis.append(ani)

            stats[rid] = self.PairwiseStats(
                min_ani=min(anis),
                mean_ani=np_mean(anis),
                std_ani=np_std(anis),
                median_ani=np_median(anis),
                ani_to_medoid=FastANI.symmetric_ani(ani_af, rid, medoid_gid)[0],
                mean_ani_to_medoid=mean_ani_to_medoid,
                mean_ani_to_rep=mean_ani_to_rep,
                ani_below_95=sum([1 for ani in anis if ani < 95]))

    sys.stdout.write('\n')

    return stats
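# Standalone illustration of the medoid rule used above: ANI is a similarity,
# so it is converted to a distance (100 - ANI) and the genome minimizing its
# summed distance to all others is the medoid. The ANI values are made up.
import numpy as np

ani = np.array([[100.0, 98.0, 96.0],
                [98.0, 100.0, 97.0],
                [96.0, 97.0, 100.0]])
dist_mat = 100.0 - ani
medoid_idx = np.argmin(dist_mat.sum(axis=0))  # genome 1 here (column sums 6, 5, 7)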
def run(self, scaffold_stats):
    """Calculate statistics for genomes.

    Parameters
    ----------
    scaffold_stats : ScaffoldStats
        Statistics for individual scaffolds.
    """

    self.logger.info(
        "Calculating statistics for {:,} genomes over {:,} scaffolds.".format(
            scaffold_stats.num_genomes(),
            scaffold_stats.num_scaffolds()))

    self.coverage_headers = scaffold_stats.coverage_headers
    self.signature_headers = scaffold_stats.signature_headers

    genome_size = defaultdict(int)
    scaffold_length = defaultdict(list)
    gc = defaultdict(list)
    coverage = defaultdict(list)
    signature = defaultdict(list)
    for _scaffold_id, stats in scaffold_stats.stats.items():
        if stats.genome_id == scaffold_stats.unbinned:
            continue

        genome_size[stats.genome_id] += stats.length
        scaffold_length[stats.genome_id].append(stats.length)
        gc[stats.genome_id].append(stats.gc)
        coverage[stats.genome_id].append(stats.coverage)
        signature[stats.genome_id].append(stats.signature)

    # record statistics for each genome
    genomic_signature = GenomicSignature(0)

    self.genome_stats = {}
    for genome_id in genome_size:
        # calculate weighted mean and median statistics
        weights = np_array(scaffold_length[genome_id])

        len_array = np_array(scaffold_length[genome_id])
        mean_len = ws.numpy_weighted_mean(len_array, weights)
        median_len = ws.numpy_weighted_median(len_array, weights)

        gc_array = np_array(gc[genome_id])
        mean_gc = ws.numpy_weighted_mean(gc_array, weights)
        median_gc = ws.numpy_weighted_median(gc_array, weights)

        cov_array = np_array(coverage[genome_id]).T
        mean_cov = ws.numpy_weighted_mean(cov_array, weights)
        median_cov = []
        for i in range(cov_array.shape[0]):
            median_cov.append(ws.numpy_weighted_median(cov_array[i, :], weights))

        signature_array = np_array(signature[genome_id]).T
        mean_signature = ws.numpy_weighted_mean(signature_array, weights)

        # calculate mean and median tetranucleotide distance
        td = []
        for scaffold_id in scaffold_stats.scaffolds_in_genome[genome_id]:
            stats = scaffold_stats.stats[scaffold_id]
            td.append(genomic_signature.manhattan(stats.signature, mean_signature))

        self.genome_stats[genome_id] = self.GenomeStats(genome_size[genome_id],
                                                        mean_len,
                                                        median_len,
                                                        mean_gc,
                                                        median_gc,
                                                        mean_cov,
                                                        median_cov,
                                                        mean_signature,
                                                        np_mean(td),
                                                        np_median(td))

    return self.genome_stats
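# `ws` above appears to be the weightedstats package, which provides
# numpy_weighted_mean/numpy_weighted_median. Should that dependency be
# unavailable, a length-weighted median can be sketched directly; this
# helper is an assumption, not part of the original module.
import numpy as np

def weighted_median(values, weights):
    # sort by value and report the value where the cumulative weight
    # first reaches half of the total weight
    order = np.argsort(values)
    v = np.asarray(values)[order]
    w = np.asarray(weights)[order]
    cum = np.cumsum(w)
    return v[np.searchsorted(cum, 0.5 * cum[-1])]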
def run(self, input_tree, taxonomy_file, output_dir, plot_taxa_file,
        plot_dist_taxa_only, plot_domain, highlight_polyphyly,
        highlight_taxa_file, trusted_taxa_file, fixed_root, min_children,
        min_support, mblet, fmeasure_table, min_fmeasure, fmeasure_mono,
        verbose_table):
    """Determine distribution of taxa at each taxonomic rank.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    taxonomy_file : str
        File with taxonomy strings for each taxa.
    output_dir : str
        Desired output directory.
    plot_taxa_file : str
        File specifying taxa to plot. Set to None to consider all taxa.
    plot_dist_taxa_only : boolean
        Only plot the taxa used to infer distribution.
    plot_domain : boolean
        Plot domain rank.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
    fixed_root : boolean
        Use a single fixed root to infer outliers.
    min_children : int
        Only consider taxa with at least the specified number of children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when inferring distribution.
    verbose_table : boolean
        Print additional columns in output table.
    """

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

    # pull taxonomy from tree and file
    self.logger.info('Reading taxonomy.')
    taxonomy = Taxonomy().read(taxonomy_file)
    tree_taxonomy = Taxonomy().read_from_tree(input_tree,
                                              warnings=False)
    gtdb_parent_ranks = Taxonomy().parents(tree_taxonomy)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # read F-measure for taxa
    fmeasure = None
    if fmeasure_table:
        fmeasure = self.read_fmeasure(fmeasure_table)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                             taxonomy,
                                                             trusted_taxa,
                                                             min_children,
                                                             min_support,
                                                             fmeasure,
                                                             min_fmeasure)

    # limit plotted taxa
    taxa_to_plot = None
    if plot_dist_taxa_only:
        taxa_to_plot = taxa_for_dist_inference
    elif plot_taxa_file:
        taxa_to_plot = read_taxa_file(plot_taxa_file)
    else:
        # plot every taxon defined in tree
        taxa_to_plot = set()
        for node in tree.preorder_node_iter():
            support, taxon, _auxiliary_info = parse_label(node.label)
            if taxon:
                # get most specific taxon from compound names
                # (e.g. p__Armatimonadetes; c__Chthonomonadetes)
                taxon = taxon.split(';')[-1].strip()
                taxa_to_plot.add(taxon)

        if False:
            # HACK FOR NCBI: only plot taxa with >= 2 taxa
            taxa_to_plot = set()
            for node in tree.preorder_node_iter():
                if not node.label or node.is_leaf():
                    continue

                support, taxon, _auxiliary_info = parse_label(node.label)
                if not taxon:
                    continue
                # get most specific taxon from compound names
                # (e.g. p__Armatimonadetes; c__Chthonomonadetes)
                taxon = taxon.split(';')[-1].strip()

                # count number of subordinate children
                rank_prefix = taxon[0:3]
                if min_children > 0 and rank_prefix != 's__':
                    child_rank_index = Taxonomy().rank_index[rank_prefix] + 1
                    child_rank_prefix = Taxonomy.rank_prefixes[child_rank_index]
                    subordinate_taxa = set()
                    for leaf in node.leaf_iter():
                        taxa = taxonomy.get(leaf.taxon.label,
                                            Taxonomy.rank_prefixes)
                        if len(taxa) > child_rank_index:
                            sub_taxon = taxa[child_rank_index]
                            if (sub_taxon != Taxonomy.rank_prefixes[child_rank_index]
                                    and sub_taxon.startswith(child_rank_prefix)):
                                subordinate_taxa.add(sub_taxon)

                    if len(subordinate_taxa) < min_children:
                        continue

                taxa_to_plot.add(taxon)

    # highlight taxa
    highlight_taxa = set()
    if highlight_taxa_file:
        for line in open(highlight_taxa_file):
            highlight_taxa.add(line.strip().split('\t')[0])

    # check if a single fixed root should be used
    if fixed_root or mblet:
        self.logger.info('Using single fixed rooting for inferring distributions.')
        if not mblet:
            rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)
        else:
            rel_dists = self.mblet(tree, taxa_for_dist_inference)

        # create fixed rooting style tables and plots
        distribution_table = os.path.join(output_dir, '%s.rank_distribution.tsv' % input_tree_name)
        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_plot(rel_dists,
                                taxa_for_dist_inference,
                                highlight_polyphyly,
                                highlight_taxa,
                                distribution_table,
                                fmeasure,
                                fmeasure_mono,
                                plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        self._median_outlier_file(rel_dists,
                                  taxa_for_dist_inference,
                                  gtdb_parent_ranks,
                                  median_outlier_table)
    else:
        # calculate relative distance to taxa
        rd = RelativeDistance()
        rel_dists = rd.rel_dist_to_named_clades(tree)

        # restrict to taxa of interest
        if taxa_to_plot:
            for r in rel_dists:
                for k in set(rel_dists[r].keys()) - set(taxa_to_plot):
                    del rel_dists[r][k]

        # report number of taxa at each rank
        print('')
        print('Rank\tTaxa to Plot\tTaxa for Inference')
        for rank, taxa in rel_dists.items():
            taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
            print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference)))
        print('')

        # *** determine phyla for inferring distribution
        if True:
            phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree, taxa_for_dist_inference)
        else:
            phyla_for_inference = filter_taxa_for_dist_inference(tree,
                                                                 taxonomy,
                                                                 trusted_taxa,
                                                                 2,
                                                                 min_support,
                                                                 fmeasure,
                                                                 min_fmeasure)
            phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree, phyla_for_inference)

        print('')
        print('Phyla for RED Inference:')
        print(','.join(phylum_rel_dists))
        phyla_file = os.path.join(output_dir, '%s.phyla.tsv' % input_tree_name)
        fout = open(phyla_file, 'w')
        for p in phylum_rel_dists:
            fout.write(p + '\n')
        fout.close()

        # set edge lengths to median value over all rootings
        tree.seed_node.rel_dist = 0.0
        for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            n.rel_dist = np_median(rel_node_dists[n.id])
            rd_to_parent = n.rel_dist - n.parent_node.rel_dist
            if rd_to_parent < 0:
                self.logger.warning('Not all branches are positive after scaling.')
            n.edge_length = rd_to_parent

        for phylum, rel_dists in phylum_rel_dists.items():
            phylum_dir = os.path.join(output_dir, phylum)
            if not os.path.exists(phylum_dir):
                os.makedirs(phylum_dir)

            # restrict to taxa of interest
            if taxa_to_plot:
                for r in rel_dists:
                    for k in set(rel_dists[r].keys()) - set(taxa_to_plot):
                        del rel_dists[r][k]

            # create distribution plot
            distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
            plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
            self._distribution_plot(rel_dists,
                                    taxa_for_dist_inference,
                                    highlight_polyphyly,
                                    highlight_taxa,
                                    distribution_table,
                                    fmeasure,
                                    fmeasure_mono,
                                    plot_file)

            median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
            self._median_outlier_file(rel_dists,
                                      taxa_for_dist_inference,
                                      gtdb_parent_ranks,
                                      median_outlier_table)

        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_summary_plot(phylum_rel_dists,
                                        taxa_for_dist_inference,
                                        highlight_polyphyly,
                                        highlight_taxa,
                                        fmeasure,
                                        fmeasure_mono,
                                        plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
        self._median_summary_outlier_file(phylum_rel_dists,
                                          taxa_for_dist_inference,
                                          gtdb_parent_ranks,
                                          median_outlier_table,
                                          median_rank_file,
                                          verbose_table)

        output_rd_file = os.path.join(output_dir, '%s.node_rd.tsv' % input_tree_name)
        self._write_rd(tree, output_rd_file)

        output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
        tree.write_to_path(output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)
def run(self, input_tree, taxonomy_file, output_dir, plot_taxa_file,
        plot_dist_taxa_only, plot_domain, trusted_taxa_file, fixed_root,
        min_children, min_support, verbose_table):
    """Determine distribution of taxa at each taxonomic rank.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    taxonomy_file : str
        File with taxonomy strings for each taxa.
    output_dir : str
        Desired output directory.
    plot_taxa_file : str
        File specifying taxa to plot. Set to None to consider all taxa.
    plot_dist_taxa_only : boolean
        Only plot the taxa used to infer distribution.
    plot_domain : boolean
        Plot domain rank.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
    fixed_root : boolean
        Use a single fixed root to infer outliers.
    min_children : int
        Only consider taxa with at least the specified number of children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when inferring distribution.
    verbose_table : boolean
        Print additional columns in output table.
    """

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

    # pull taxonomy from tree
    if not taxonomy_file:
        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, '%s.taxonomy.tsv' % input_tree_name)
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)
    else:
        self.logger.info('Reading taxonomy from file.')
        taxonomy = Taxonomy().read(taxonomy_file)

    gtdb_parent_ranks = Taxonomy().parents(taxonomy)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                             taxonomy,
                                                             trusted_taxa,
                                                             min_children,
                                                             min_support)

    # limit plotted taxa
    taxa_to_plot = None
    if plot_dist_taxa_only:
        taxa_to_plot = taxa_for_dist_inference
    elif plot_taxa_file:
        taxa_to_plot = read_taxa_file(plot_taxa_file)

    # check if a single fixed root should be used
    if fixed_root:
        self.logger.info('Using single fixed rooting for inferring distributions.')
        rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)

        # create fixed rooting style tables and plots
        distribution_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_plot(rel_dists,
                                taxa_for_dist_inference,
                                distribution_table,
                                plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        self._median_outlier_file(rel_dists,
                                  taxa_for_dist_inference,
                                  gtdb_parent_ranks,
                                  median_outlier_table)
    else:
        # calculate relative distance to taxa
        rd = RelativeDistance()
        rel_dists = rd.rel_dist_to_named_clades(tree)

        # report number of taxa at each rank
        print('')
        print('Rank\tTaxa to Plot\tTaxa for Inference')
        for rank, taxa in rel_dists.items():
            taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
            print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference)))
        print('')

        phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree,
                                                                     taxa_for_dist_inference,
                                                                     taxonomy)

        # set edge lengths to median value over all rootings
        tree.seed_node.rel_dist = 0.0
        for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            n.rel_dist = np_median(rel_node_dists[n.id])
            rd_to_parent = n.rel_dist - n.parent_node.rel_dist
            if rd_to_parent < 0:
                self.logger.warning('Not all branches are positive after scaling.')
            n.edge_length = rd_to_parent

        for phylum, rel_dists in phylum_rel_dists.items():
            phylum_dir = os.path.join(output_dir, phylum)
            if not os.path.exists(phylum_dir):
                os.makedirs(phylum_dir)

            # create distribution plot
            distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
            plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
            self._distribution_plot(rel_dists,
                                    taxa_for_dist_inference,
                                    distribution_table,
                                    plot_file)

            median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
            self._median_outlier_file(rel_dists,
                                      taxa_for_dist_inference,
                                      gtdb_parent_ranks,
                                      median_outlier_table)

        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_summary_plot(phylum_rel_dists,
                                        taxa_for_dist_inference,
                                        plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
        self._median_summary_outlier_file(phylum_rel_dists,
                                          taxa_for_dist_inference,
                                          gtdb_parent_ranks,
                                          median_outlier_table,
                                          median_rank_file,
                                          verbose_table)

        output_rd_file = os.path.join(output_dir, '%s.node_rd.tsv' % input_tree_name)
        self._write_rd(tree, output_rd_file)

        output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
        tree.write_to_path(output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)
def _median_outlier_file(self, rel_dists,
                         taxa_for_dist_inference,
                         gtdb_parent_ranks,
                         output_file):
    """Identify outliers relative to the median of rank distributions.

    Parameters
    ----------
    rel_dists: d[rank_index][taxon] -> relative divergence
        Relative divergence of taxa at each rank.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    gtdb_parent_ranks: d[taxon] -> string indicating parent taxa
        Parent taxa for each taxon.
    output_file : str
        Desired name of output table.
    """

    # determine median relative distance for each rank
    median_rel_dist = {}
    for rank, d in rel_dists.items():
        v = [dist for taxa, dist in d.items() if taxa in taxa_for_dist_inference]
        if len(v) == 0:
            continue

        median_rel_dist[rank] = np_median(v)

    fout = open(output_file, 'w')
    fout.write('Taxa\tGTDB taxonomy\tMedian distance\tMedian difference\tClosest rank\tClassification\n')
    for i, rank in enumerate(sorted(rel_dists.keys())):
        for clade_label, dist in rel_dists[rank].items():
            if rank in median_rel_dist:
                delta = dist - median_rel_dist[rank]

                closest_rank_dist = 1e10
                for test_rank, test_median in median_rel_dist.items():
                    abs_dist = abs(dist - test_median)
                    if abs_dist < closest_rank_dist:
                        closest_rank_dist = abs_dist
                        closest_rank = Taxonomy.rank_labels[test_rank]

                classification = "OK"
                if delta < -0.2:
                    classification = "very overclassified"
                elif delta < -0.1:
                    classification = "overclassified"
                elif delta > 0.2:
                    classification = "very underclassified"
                elif delta > 0.1:
                    classification = "underclassified"

                fout.write('%s\t%s\t%.3f\t%.3f\t%s\t%s\n' % (clade_label,
                                                             ';'.join(gtdb_parent_ranks[clade_label]),
                                                             dist,
                                                             delta,
                                                             closest_rank,
                                                             classification))
            else:
                fout.write('%s\t%s\t%.3f\t%.3f\t%s\t%s\n' % (clade_label,
                                                             ';'.join(gtdb_parent_ranks[clade_label]),
                                                             dist,
                                                             -1,
                                                             'NA',
                                                             'Insufficient data to calculate median for rank.'))
    fout.close()
def _pairwise_stats(self, clusters, genome_files):
    """Calculate statistics for all pairwise comparisons in a species cluster."""

    self.logger.info('Calculating statistics for all pairwise comparisons in a species cluster:')

    stats = {}
    for idx, (rid, cids) in enumerate(clusters.items()):
        statusStr = '-> Processing %d of %d (%.2f%%) clusters (size = %d).'.ljust(86) % (
            idx + 1, len(clusters), float((idx + 1) * 100) / len(clusters), len(cids))
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()

        if len(cids) == 0:
            stats[rid] = self.PairwiseStats(min_ani=-1,
                                            mean_ani=-1,
                                            std_ani=-1,
                                            median_ani=-1,
                                            ani_to_medoid=-1,
                                            mean_ani_to_medoid=-1,
                                            ani_below_95=-1)
        else:
            if len(cids) > self.max_genomes_for_stats:
                cids = set(random.sample(cids, self.max_genomes_for_stats))

            # calculate ANI between all pairs of genomes in the cluster
            gid_pairs = []
            gids = list(cids.union([rid]))
            for gid1, gid2 in combinations(gids, 2):
                gid_pairs.append((gid1, gid2))
                gid_pairs.append((gid2, gid1))
            ani_af = self.ani_cache.fastani_pairs(gid_pairs,
                                                  genome_files,
                                                  report_progress=False)

            # calculate medoid point
            if len(gids) > 2:
                dist_mat = np_zeros((len(gids), len(gids)))
                for i, gid1 in enumerate(gids):
                    for j, gid2 in enumerate(gids):
                        if i < j:
                            ani, af = symmetric_ani(ani_af, gid1, gid2)
                            # ANI is a similarity, so store 100 - ANI as the
                            # distance; argmin below then selects the medoid
                            dist_mat[i, j] = 100 - ani
                            dist_mat[j, i] = 100 - ani

                medoid_idx = np_argmin(dist_mat.sum(axis=0))
                medoid_gid = gids[medoid_idx]
            else:
                # with only 2 genomes in a cluster, the representative is the
                # natural medoid at least for reporting statistics for the
                # individual species cluster
                medoid_gid = rid

            mean_ani_to_medoid = np_mean([symmetric_ani(ani_af, gid, medoid_gid)[0]
                                          for gid in gids if gid != medoid_gid])

            # calculate statistics
            anis = []
            for gid1, gid2 in combinations(gids, 2):
                ani, af = symmetric_ani(ani_af, gid1, gid2)
                anis.append(ani)

            stats[rid] = self.PairwiseStats(min_ani=min(anis),
                                            mean_ani=np_mean(anis),
                                            std_ani=np_std(anis),
                                            median_ani=np_median(anis),
                                            ani_to_medoid=symmetric_ani(ani_af, rid, medoid_gid)[0],
                                            mean_ani_to_medoid=mean_ani_to_medoid,
                                            ani_below_95=sum([1 for ani in anis if ani < 95]))

    sys.stdout.write('\n')

    return stats
def _calculate_red_distances(self, input_tree, out_dir):
    """Calculate relative evolutionary divergence (RED) over multiple
    phylum-level rootings and scale the tree accordingly.

    Each node is assigned the median RED over all rootings and branch
    lengths are rescaled to the RED difference between child and parent.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    out_dir : str
        Desired output directory.

    Returns
    -------
    dendropy.Tree
        Tree scaled to RED values.
    """

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    self.logger.info('Reading taxonomy from file.')
    taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

    # determine taxa to be used for inferring distribution
    trusted_taxa = None
    taxa_for_dist_inference = self._filter_taxa_for_dist_inference(
        tree, taxonomy, trusted_taxa, Config.RED_MIN_CHILDREN, Config.RED_MIN_SUPPORT)

    phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(
        tree, taxa_for_dist_inference, taxonomy)

    # set edge lengths to median value over all rootings
    tree.seed_node.rel_dist = 0.0
    for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
        n.rel_dist = np_median(rel_node_dists[n.id])
        rd_to_parent = n.rel_dist - n.parent_node.rel_dist
        if rd_to_parent < 0:
            # This can occur since we are setting all nodes
            # to their median RED value.
            # self.logger.warning('Not all branches are positive after scaling.')
            pass
        n.edge_length = rd_to_parent

    if False:
        # These plots can be useful for debugging and internal use,
        # but are likely to be confusing to users.
        rd = RelativeDistance()

        input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

        plot_file = os.path.join(out_dir, '%s.png' % input_tree_name)
        rd._distribution_summary_plot(phylum_rel_dists,
                                      taxa_for_dist_inference,
                                      plot_file)

        gtdb_parent_ranks = Taxonomy().parents(taxonomy)
        median_outlier_table = os.path.join(out_dir, '%s.tsv' % input_tree_name)
        median_rank_file = os.path.join(out_dir, '%s.dict' % input_tree_name)
        rd._median_summary_outlier_file(phylum_rel_dists,
                                        taxa_for_dist_inference,
                                        gtdb_parent_ranks,
                                        median_outlier_table,
                                        median_rank_file,
                                        False)

    input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]
    output_tree = os.path.join(out_dir, '%s.scaled.tree' % input_tree_name)
    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    return tree
def _distribution_summary_plot(self, phylum_rel_dists,
                               taxa_for_dist_inference,
                               highlight_polyphyly,
                               highlight_taxa,
                               fmeasure,
                               fmeasure_mono,
                               plot_file):
    """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

    Parameters
    ----------
    phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
        Relative divergence of taxon at each rank for different phylum-level rootings.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    plot_file : str
        Desired name of output plot.
    """

    self.fig.clear()
    self.fig.set_size_inches(12, 6)
    ax = self.fig.add_subplot(111)

    # determine median relative distance for each taxa
    medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

    # create percentile and classification boundary lines
    percentiles = {}
    for i, rank in enumerate(sorted(medians_for_taxa.keys())):
        v = [np_median(dists)
             for taxon, dists in medians_for_taxa[rank].items()
             if taxon in taxa_for_dist_inference]
        if not v:
            # no taxa at rank suitable for creating classification boundaries
            continue

        p10, p50, p90 = np_percentile(v, [10, 50, 90])
        # ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p50, p50), (i, i + 0.5), c=(0.0, 0.0, 1.0), lw=2, zorder=2)
        # ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

        for b in [-0.1, 0.1]:
            boundary = p50 + b
            if boundary < 1.0 and boundary > 0.0:
                if abs(b) == 0.1:
                    c = (0.0, 0.0, 0.0)
                else:
                    c = (1.0, 0.0, 0.0)
                ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

        percentiles[i] = [p10, p50, p90]

    # create scatter plot and results table
    x = []
    y = []
    c = []
    labels = []
    rank_labels = []
    for i, rank in enumerate(sorted(medians_for_taxa.keys())):
        rank_label = Taxonomy.rank_labels[rank]
        rank_labels.append(rank_label.capitalize() + ' (%d)' % len(medians_for_taxa[rank]))

        mono = []
        poly = []
        near_mono = []
        for clade_label, dists in medians_for_taxa[rank].items():
            md = np_median(dists)
            x.append(md)
            y.append(i)
            labels.append(clade_label)

            if ((highlight_polyphyly and fmeasure[clade_label] < fmeasure_mono)
                    or clade_label in highlight_taxa):
                c.append((1.0, 0.0, 0.0))
                poly.append(md)
            elif highlight_polyphyly and fmeasure[clade_label] != 1.0:
                c.append((255.0 / 255, 187.0 / 255, 120.0 / 255))
                near_mono.append(md)
            else:
                c.append((152.0 / 255, 223.0 / 255, 138.0 / 255))
                mono.append(md)

        # histogram for each rank
        binwidth = 0.025
        bins = np_arange(0, 1.0 + binwidth, binwidth)
        max_bin_count = max(np_histogram(mono + near_mono + poly, bins=bins)[0])

        mono_bottom = 0
        near_mono_bottom = 0
        mono = np_array(mono)
        near_mono = np_array(near_mono)
        poly = np_array(poly)
        if len(mono) > 0:
            mono_bottom, b, p = ax.hist(mono, bins=bins,
                                        color=(152.0 / 255, 223.0 / 255, 138.0 / 255),
                                        alpha=0.5,
                                        weights=0.9 * (1.0 / max_bin_count) * np_ones_like(mono),
                                        bottom=i,
                                        lw=0,
                                        zorder=0)

        if len(near_mono) > 0:
            near_mono_bottom, b, p = ax.hist(near_mono, bins=bins,
                                             color=(255.0 / 255, 187.0 / 255, 120.0 / 255),
                                             alpha=0.5,
                                             weights=0.9 * (1.0 / max_bin_count) * np_ones_like(near_mono),
                                             bottom=i + mono_bottom,
                                             lw=0,
                                             zorder=0)

        if len(poly) > 0:
            ax.hist(poly, bins=bins,
                    color=(1.0, 0.0, 0.0),
                    alpha=0.5,
                    weights=0.9 * (1.0 / max_bin_count) * np_ones_like(poly),
                    bottom=i + mono_bottom + near_mono_bottom,
                    lw=0,
                    zorder=0)

    scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

    ax.set_xlabel('Relative Evolutionary Divergence')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.01, 1.01])

    ax.set_ylabel('Rank (no. taxa)')
    ax.set_yticks(list(range(0, len(medians_for_taxa))))
    ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
    ax.set_yticklabels(rank_labels)

    self.prettify(ax)

    # make plot interactive
    mpld3.plugins.clear(self.fig)
    mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
    mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(plot_file, dpi=self.dpi)
    self.fig.savefig(plot_file.replace('.png', '.svg'), dpi=self.dpi)
def _distribution_summary_plot(self, phylum_rel_dists, taxa_for_dist_inference, plot_file):
    """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

    Parameters
    ----------
    phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
        Relative divergence of taxon at each rank for different phylum-level rootings.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    plot_file : str
        Desired name of output plot.
    """

    self.fig.clear()
    self.fig.set_size_inches(12, 6)
    ax = self.fig.add_subplot(111)

    # determine median relative distance for each taxa
    medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

    # create percentile and classification boundary lines
    percentiles = {}
    for i, rank in enumerate(sorted(medians_for_taxa.keys())):
        v = [np_median(dists)
             for taxon, dists in medians_for_taxa[rank].items()
             if taxon in taxa_for_dist_inference]
        if not v:
            # no taxa at rank suitable for creating classification boundaries
            continue

        p10, p50, p90 = np_percentile(v, [10, 50, 90])
        ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
        ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

        for b in [-0.2, -0.1, 0.1, 0.2]:
            boundary = p50 + b
            if boundary < 1.0 and boundary > 0.0:
                if abs(b) == 0.1:
                    c = (1.0, 0.65, 0.0)  # orange
                else:
                    c = (1.0, 0.0, 0.0)
                ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

        percentiles[i] = [p10, p50, p90]

    # create scatter plot and results table
    x = []
    y = []
    c = []
    labels = []
    rank_labels = []
    for i, rank in enumerate(sorted(medians_for_taxa.keys())):
        rank_label = Taxonomy.rank_labels[rank]
        rank_labels.append(rank_label + ' (%d)' % len(medians_for_taxa[rank]))

        mono = []
        poly = []
        no_inference = []
        for clade_label, dists in medians_for_taxa[rank].items():
            md = np_median(dists)
            x.append(md)
            y.append(i)
            labels.append(clade_label)

            if is_integer(clade_label.split('^')[-1]):
                # taxa with a numerical suffix after a caret indicate
                # polyphyletic groups when decorated with tax2tree
                c.append((1.0, 0.0, 0.0))
                poly.append(md)
            elif clade_label not in taxa_for_dist_inference:
                c.append((0.3, 0.3, 0.3))
                no_inference.append(md)
            else:
                c.append((0.0, 0.0, 1.0))
                mono.append(md)

        # histogram for each rank
        mono = np_array(mono)
        no_inference = np_array(no_inference)
        poly = np_array(poly)
        binwidth = 0.025
        bins = np_arange(0, 1.0 + binwidth, binwidth)

        mono_max_count = max(np_histogram(mono, bins=bins)[0])
        mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

        w = float(len(mono)) / (len(mono) + len(poly) + len(no_inference))
        n, b, p = ax.hist(mono, bins=bins,
                          color=(0.0, 0.0, 1.0),
                          alpha=0.25,
                          weights=0.9 * w * mono_weights,
                          bottom=i,
                          lw=0,
                          zorder=0)

        if len(no_inference) > 0:
            no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
            no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)

            ax.hist(no_inference, bins=bins,
                    color=(0.3, 0.3, 0.3),
                    alpha=0.25,
                    weights=0.9 * (1.0 - w) * no_inference_weights,
                    bottom=i + n,
                    lw=0,
                    zorder=0)

        if len(poly) > 0:
            poly_max_count = max(np_histogram(poly, bins=bins)[0])
            poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

            ax.hist(poly, bins=bins,
                    color=(1.0, 0.0, 0.0),
                    alpha=0.25,
                    weights=0.9 * (1.0 - w) * poly_weights,
                    bottom=i + n,
                    lw=0,
                    zorder=0)

    scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.01, 1.01])

    ax.set_ylabel('rank (no. taxa)')
    ax.set_yticks(list(range(0, len(medians_for_taxa))))
    ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
    ax.set_yticklabels(rank_labels)

    self.prettify(ax)

    # make plot interactive
    mpld3.plugins.clear(self.fig)
    mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
    mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(plot_file, dpi=self.dpi)
def run(self, input_tree, taxonomy_file, output_dir,
        plot_taxa_file, plot_dist_taxa_only, plot_domain,
        trusted_taxa_file, fixed_root, min_children,
        min_support, verbose_table):
    """Determine distribution of taxa at each taxonomic rank.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    taxonomy_file : str
        File with taxonomy strings for each taxon.
    output_dir : str
        Desired output directory.
    plot_taxa_file : str
        File specifying taxa to plot. Set to None to consider all taxa.
    plot_dist_taxa_only : boolean
        Only plot the taxa used to infer the distribution.
    plot_domain : boolean
        Plot domain rank.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring the
        distribution. Set to None to consider all taxa.
    fixed_root : boolean
        Use a single fixed root to infer outliers.
    min_children : int
        Only consider taxa with at least the specified number of child
        taxa when inferring the distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring the distribution.
    verbose_table : boolean
        Print additional columns in output table.
    """

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

    # pull taxonomy from tree
    if not taxonomy_file:
        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, '%s.taxonomy.tsv' % input_tree_name)
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)
    else:
        self.logger.info('Reading taxonomy from file.')
        taxonomy = Taxonomy().read(taxonomy_file)

    gtdb_parent_ranks = Taxonomy().parents(taxonomy)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                             taxonomy,
                                                             trusted_taxa,
                                                             min_children,
                                                             min_support)

    # limit plotted taxa
    taxa_to_plot = None
    if plot_dist_taxa_only:
        taxa_to_plot = taxa_for_dist_inference
    elif plot_taxa_file:
        taxa_to_plot = read_taxa_file(plot_taxa_file)

    # check if a single fixed root should be used
    if fixed_root:
        self.logger.info('Using single fixed rooting for inferring distributions.')
        rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)

        # create fixed rooting style tables and plots; distinct suffixes
        # keep the distribution and outlier tables from overwriting each other
        distribution_table = os.path.join(output_dir, '%s.rank_distribution.tsv' % input_tree_name)
        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_plot(rel_dists,
                                taxa_for_dist_inference,
                                distribution_table,
                                plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.median_outlier.tsv' % input_tree_name)
        self._median_outlier_file(rel_dists,
                                  taxa_for_dist_inference,
                                  gtdb_parent_ranks,
                                  median_outlier_table)
    else:
        # calculate relative distance to taxa
        rd = RelativeDistance()
        rel_dists = rd.rel_dist_to_named_clades(tree)

        # report number of taxa at each rank
        print('')
        print('Rank\tTaxa to Plot\tTaxa for Inference')
        for rank, taxa in rel_dists.items():
            taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
            print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference)))
        print('')

        phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree,
                                                                     taxa_for_dist_inference,
                                                                     taxonomy)

        # set edge lengths to median value over all rootings
        tree.seed_node.rel_dist = 0.0
        for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            n.rel_dist = np_median(rel_node_dists[n.id])
            rd_to_parent = n.rel_dist - n.parent_node.rel_dist
            if rd_to_parent < 0:
                self.logger.warning('Not all branches are positive after scaling.')
            n.edge_length = rd_to_parent

        for phylum, rel_dists in phylum_rel_dists.items():
            phylum_dir = os.path.join(output_dir, phylum)
            if not os.path.exists(phylum_dir):
                os.makedirs(phylum_dir)

            # create distribution plot
            distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
            plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
            self._distribution_plot(rel_dists,
                                    taxa_for_dist_inference,
                                    distribution_table,
                                    plot_file)

            median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
            self._median_outlier_file(rel_dists,
                                      taxa_for_dist_inference,
                                      gtdb_parent_ranks,
                                      median_outlier_table)

        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_summary_plot(phylum_rel_dists,
                                        taxa_for_dist_inference,
                                        plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.median_outlier.tsv' % input_tree_name)
        median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
        self._median_summary_outlier_file(phylum_rel_dists,
                                          taxa_for_dist_inference,
                                          gtdb_parent_ranks,
                                          median_outlier_table,
                                          median_rank_file,
                                          verbose_table)

        # write the tree with rescaled edge lengths
        output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
        tree.write_to_path(output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)
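# A minimal, self-contained sketch of the edge-rescaling step above,
# checked on a toy dendropy tree. The tree, node labels, and the
# rel_node_dists values below are hypothetical, not from the source.
import dendropy
from numpy import median as np_median

toy_tree = dendropy.Tree.get(data='((A:1.0,B:1.0)AB:1.0,C:2.0)root;',
                             schema='newick')

# hypothetical relative divergences collected over several rootings
rel_node_dists = {
    'AB': [0.45, 0.50, 0.55],
    'A': [1.0], 'B': [1.0], 'C': [1.0],
}

toy_tree.seed_node.rel_dist = 0.0
for n in toy_tree.preorder_node_iter(lambda n: n is not toy_tree.seed_node):
    label = n.taxon.label if n.taxon else n.label
    # each edge becomes the difference between the median relative
    # divergence of the node and that of its parent
    n.rel_dist = np_median(rel_node_dists[label])
    rd_to_parent = n.rel_dist - n.parent_node.rel_dist
    if rd_to_parent < 0:
        print('negative branch after scaling: %s' % label)
    n.edge_length = rd_to_parent

print(toy_tree.as_string(schema='newick'))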
def _median_outlier_file(self, rel_dists,
                         taxa_for_dist_inference,
                         gtdb_parent_ranks,
                         output_file):
    """Identify outliers relative to the median of rank distributions.

    Parameters
    ----------
    rel_dists: d[rank_index][taxon] -> relative divergence
        Relative divergence of taxa at each rank.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    gtdb_parent_ranks: d[taxon] -> string indicating parent taxa
        Parent taxa for each taxon.
    output_file : str
        Desired name of output table.
    """

    # determine median relative distance for each rank
    median_rel_dist = {}
    for rank, d in rel_dists.items():
        v = [dist for taxa, dist in d.items() if taxa in taxa_for_dist_inference]
        if len(v) == 0:
            continue

        median_rel_dist[rank] = np_median(v)

    fout = open(output_file, 'w')
    fout.write('Taxa\tGTDB taxonomy\tMedian distance\tMedian difference\tClosest rank\tClassification\n')
    for i, rank in enumerate(sorted(rel_dists.keys())):
        for clade_label, dist in rel_dists[rank].items():
            if rank in median_rel_dist:
                delta = dist - median_rel_dist[rank]

                # find the rank whose median is closest to this taxon's divergence
                closest_rank_dist = 1e10
                for test_rank, test_median in median_rel_dist.items():
                    abs_dist = abs(dist - test_median)
                    if abs_dist < closest_rank_dist:
                        closest_rank_dist = abs_dist
                        closest_rank = Taxonomy.rank_labels[test_rank]

                classification = "OK"
                if delta < -0.2:
                    classification = "very overclassified"
                elif delta < -0.1:
                    classification = "overclassified"
                elif delta > 0.2:
                    classification = "very underclassified"
                elif delta > 0.1:
                    classification = "underclassified"

                fout.write('%s\t%s\t%.3f\t%.3f\t%s\t%s\n' % (clade_label,
                                                             ';'.join(gtdb_parent_ranks[clade_label]),
                                                             dist,
                                                             delta,
                                                             closest_rank,
                                                             classification))
            else:
                fout.write('%s\t%s\t%.3f\t%.3f\t%s\t%s\n' % (clade_label,
                                                             ';'.join(gtdb_parent_ranks[clade_label]),
                                                             dist,
                                                             -1,
                                                             'NA',
                                                             'Insufficient data to calculate median for rank.'))

    fout.close()
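# The delta thresholds above band each taxon into five classes around the
# rank median. For illustration, the same logic as a tiny standalone helper;
# the function name is ours, not from the source.
def classify_delta(delta):
    # delta is the taxon's relative divergence minus the rank median
    if delta < -0.2:
        return 'very overclassified'
    elif delta < -0.1:
        return 'overclassified'
    elif delta > 0.2:
        return 'very underclassified'
    elif delta > 0.1:
        return 'underclassified'
    return 'OK'

# e.g. a genus whose relative divergence sits 0.15 below the genus median
print(classify_delta(-0.15))  # 'overclassified'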
def _median_summary_outlier_file(self, phylum_rel_dists,
                                 taxa_for_dist_inference,
                                 gtdb_parent_ranks,
                                 outlier_table,
                                 rank_file,
                                 verbose_table):
    """Identify outliers relative to the median of rank distributions.

    Parameters
    ----------
    phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
        Relative divergence of taxon at each rank for different phylum-level rootings.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    gtdb_parent_ranks: d[taxon] -> string indicating parent taxa
        Parent taxa for each taxon.
    outlier_table : str
        Desired name of output table.
    rank_file : str
        Desired name of file indicating median relative distance of each rank.
    verbose_table : boolean
        Print additional columns in output table.
    """

    # determine median relative distance for each taxon
    medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

    # determine median relative distance for each rank
    median_for_rank = self.rank_median_rd(phylum_rel_dists, taxa_for_dist_inference)

    fout_rank = open(rank_file, 'w')
    median_str = []
    for rank in sorted(median_for_rank.keys()):
        median_str.append('"' + Taxonomy.rank_labels[rank] + '":' + str(median_for_rank[rank]))
    fout_rank.write('{' + ','.join(median_str) + '}\n')
    fout_rank.close()

    fout = open(outlier_table, 'w')
    if verbose_table:
        fout.write('Taxa\tGTDB taxonomy\tMedian distance')
        fout.write('\tMedian of rank\tMedian difference')
        fout.write('\tClosest rank\tClassification\n')
    else:
        fout.write('Taxa\tGTDB taxonomy\tMedian distance\tMedian difference\tClosest rank\tClassification\n')

    for rank in sorted(median_for_rank.keys()):
        for clade_label, dists in medians_for_taxa[rank].items():
            dists = np_array(dists)

            taxon_median = np_median(dists)
            delta = taxon_median - median_for_rank[rank]

            # find the rank whose median is closest to this taxon's median divergence
            closest_rank_dist = 1e10
            for test_rank, test_median in median_for_rank.items():
                abs_dist = abs(taxon_median - test_median)
                if abs_dist < closest_rank_dist:
                    closest_rank_dist = abs_dist
                    closest_rank = Taxonomy.rank_labels[test_rank]

            classification = "OK"
            if delta < -0.2:
                classification = "very overclassified"
            elif delta < -0.1:
                classification = "overclassified"
            elif delta > 0.2:
                classification = "very underclassified"
            elif delta > 0.1:
                classification = "underclassified"

            if verbose_table:
                fout.write('%s\t%s\t%.2f\t%.3f\t%.3f\t%s\t%s\n' % (clade_label,
                                                                   ';'.join(gtdb_parent_ranks[clade_label]),
                                                                   taxon_median,
                                                                   median_for_rank[rank],
                                                                   delta,
                                                                   closest_rank,
                                                                   classification))
            else:
                fout.write('%s\t%s\t%.3f\t%.3f\t%s\t%s\n' % (clade_label,
                                                             ';'.join(gtdb_parent_ranks[clade_label]),
                                                             taxon_median,
                                                             delta,
                                                             closest_rank,
                                                             classification))

    fout.close()
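# The rank_file written above holds a single one-line dict literal such as
# {"phylum":0.32,"class":0.51}. A sketch of reading it back; the path below
# is hypothetical.
import ast

with open('my_tree.dict') as f:
    median_for_rank_label = ast.literal_eval(f.read())

print(median_for_rank_label.get('genus'))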
# find the median by counting: for each value, count how many elements are
# greater and how many are smaller; the median balances the two counts
# (this assumes an odd number of elements with a unique middle value)
for i in orig_list3:
    more, less = 0, 0
    for j in orig_list3:
        if i > j:
            more += 1
        elif i < j:
            less += 1
    if more == less:
        median = i
        break

print('Median:', median)

from numpy import median as np_median
print('Check against numpy:', np_median(orig_list3))


def gnome_sort(orig_list):
    i = 1
    while i < len(orig_list):
        if not i or orig_list[i - 1] <= orig_list[i]:
            i += 1
        else:
            orig_list[i], orig_list[i - 1] = orig_list[i - 1], orig_list[i]
            i -= 1
    return orig_list


print('Gnome sort, ascending:', gnome_sort(orig_list3))
print('m-th element:', gnome_sort(orig_list3)[m])
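# The counting approach above only terminates when the list length is odd
# and a unique middle value exists; a sort-based median also covers the
# even case. A minimal sketch:
def median_sorted(values):
    s = sorted(values)
    mid = len(s) // 2
    if len(s) % 2:
        return s[mid]
    # even length: average the two middle values
    return (s[mid - 1] + s[mid]) / 2.0

print(median_sorted([3, 1, 4, 1, 5]))     # 3
print(median_sorted([3, 1, 4, 1, 5, 9]))  # 3.5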
def create_feat_mat(graph_list, n_feats):
    # graph density and node counts
    dens_pos = [nx_density(graph) for graph in graph_list]
    nodes_pos = [nx_number_of_nodes(graph) for graph in graph_list]

    # CC (clustering coefficient) statistics - mean, max and variance -
    # faster to use a big loop mostly
    CC_mean = []
    CC_mean_append = CC_mean.append
    CC_max = []
    CC_max_append = CC_max.append
    CC_var = []
    CC_var_append = CC_var.append

    # Degree correlation - avg degree of the neighborhood
    DC_mean = []
    DC_mean_append = DC_mean.append
    DC_max = []
    DC_max_append = DC_max.append
    DC_var = []
    DC_var_append = DC_var.append

    # Degree statistics
    degree_mean = []
    degree_mean_append = degree_mean.append
    degree_max = []
    degree_max_append = degree_max.append
    degree_median = []
    degree_median_append = degree_median.append
    degree_var = []
    degree_var_append = degree_var.append

    # Edge weight statistics
    edge_wt_mean = []
    edge_wt_mean_append = edge_wt_mean.append
    edge_wt_max = []
    edge_wt_max_append = edge_wt_max.append
    edge_wt_var = []
    edge_wt_var_append = edge_wt_var.append

    # First 3 singular values
    sv1 = []
    sv1_append = sv1.append
    sv2 = []
    sv2_append = sv2.append
    sv3 = []
    sv3_append = sv3.append

    for graph in graph_list:
        CCs = list(nx_clustering(graph).values())
        CC_max_append(max(CCs))
        CC_mean_append(np_mean(CCs))
        CC_var_append(np_var(CCs))

        DCs = list(nx_average_neighbor_degree(graph).values())
        DC_max_append(max(DCs))
        DC_mean_append(np_mean(DCs))
        DC_var_append(np_var(DCs))

        degrees = [tup[1] for tup in graph.degree()]
        degree_mean_append(np_mean(degrees))
        degree_median_append(np_median(degrees))
        degree_max_append(max(degrees))
        degree_var_append(np_var(degrees))

        edge_wts = [tup[2] for tup in graph.edges.data('weight')]
        edge_wt_mean_append(np_mean(edge_wts))
        edge_wt_var_append(np_var(edge_wts))
        edge_wt_max_append(max(edge_wts))

        # pad singular values with zeros for graphs with fewer than 3
        A_mat = nx_to_numpy_matrix(graph)
        svs = np_linalg_svd(A_mat, full_matrices=False, compute_uv=False)

        if len(svs) >= 3:
            sv1_append(svs[0])
            sv2_append(svs[1])
            sv3_append(svs[2])
        elif len(svs) >= 2:
            sv1_append(svs[0])
            sv2_append(svs[1])
            sv3_append(0)
        else:
            sv1_append(svs[0])
            sv2_append(0)
            sv3_append(0)

    feat_mat = np_vstack((dens_pos, nodes_pos, degree_max, degree_mean,
                          degree_median, degree_var, CC_max, CC_mean, CC_var,
                          edge_wt_mean, edge_wt_max, edge_wt_var,
                          DC_mean, DC_var, DC_max, sv1, sv2, sv3)).T

    if n_feats == 1:
        feat_mat = np_array(dens_pos).reshape(-1, 1)

    return feat_mat
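# A hypothetical usage sketch for create_feat_mat. The nx_*/np_* names it
# uses are assumed to be aliases like the ones below (nx_to_numpy_matrix
# exists in networkx < 3.0; newer releases removed it in favor of
# nx.to_numpy_array).
import networkx as nx
import numpy as np

nx_density = nx.density
nx_number_of_nodes = nx.number_of_nodes
nx_clustering = nx.clustering
nx_average_neighbor_degree = nx.average_neighbor_degree
nx_to_numpy_matrix = nx.to_numpy_matrix
np_mean, np_var, np_median = np.mean, np.var, np.median
np_linalg_svd = np.linalg.svd
np_vstack, np_array = np.vstack, np.array

# two small weighted graphs standing in for candidate complexes
g1 = nx.Graph()
g1.add_weighted_edges_from([('a', 'b', 0.9), ('b', 'c', 0.4), ('a', 'c', 0.7)])
g2 = nx.Graph()
g2.add_weighted_edges_from([('x', 'y', 0.5), ('y', 'z', 0.8)])

feats = create_feat_mat([g1, g2], n_feats=18)
print(feats.shape)  # (2, 18): one row per graph, columns as stacked above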