Example 1
    def rank_median_rd(self, phylum_rel_dists, taxa_for_dist_inference):
        """Calculate median relative divergence for each rank.

        Parameters
        ----------
        phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
            Relative divergence of taxon at each rank for different phylum-level rootings.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        """

        medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

        median_for_rank = {}
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            v = [
                np_median(dists)
                for taxon, dists in medians_for_taxa[rank].items()
                if taxon in taxa_for_dist_inference
            ]

            if v:
                median_for_rank[rank] = np_median(v)

        return median_for_rank
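
A note on naming: across these examples, identifiers such as np_median, np_mean, np_std, np_array and sp_normaltest are module-level aliases created at import time. The import lines are not shown in the snippets; a plausible reconstruction (an assumption, not taken from the original sources) is:

    # Assumed alias imports; not present in the original snippets
    import numpy as np
    from numpy import (median as np_median, mean as np_mean, std as np_std,
                       array as np_array, abs as np_abs, sqrt as np_sqrt,
                       zeros as np_zeros, around as np_around)
    from scipy.stats import normaltest as sp_normaltest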
Example 2
    def compute(self):
        """Detect RFI
        """
        median_size = (self.median_size_time, self.median_size_freq)

        data = self.data - sp_dip.median_filter(self.data, size=median_size)
        #    data1 = np.abs(np.sum(data,0))
        #    data1 = np.abs(np.median(data,0))
        data1 = np_median(data, 0)
        #    th = np.percentile(data1, th_prctile)
        thl = []
        for ii in range(data1.shape[0] - 9):
            thl.append(prctile(data1[ii:ii + 10], p=90))

        # thl.append(max(data1[ii:ii+10]))

        th = self.th_k * np_median(thl)
        for ii in range(data1.shape[0]):
            if data1[ii] > th:
                z, p_value = sp_normaltest(data[:, ii])
                if p_value < self.p_th:
                    if self.is_out_selected('Not_normal'):
                        self.flag_results['Not_normal'].flag_data[:, ii] = 1
                else:
                    if self.is_out_selected('Normal'):
                        self.flag_results['Normal'].flag_data[:, ii] = 1
        return self.flag_results
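
The loop above computes a rolling 90th percentile over 10-sample windows. A vectorized sketch of the same computation, assuming NumPy >= 1.20 (the interpolation used by the legacy prctile helper may differ slightly):

    import numpy as np
    from numpy.lib.stride_tricks import sliding_window_view

    def rolling_p90(data1, window=10):
        # One percentile per window; same number of entries as thl above
        return np.percentile(sliding_window_view(data1, window), 90, axis=1)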
Example 3
    def makeBinDist(self, transformedCP, averageCoverages, kmerNormPC1, kmerPCs, contigGCs, contigLengths):
        """Determine the distribution of the points in this bin

        The distribution is largely normal, except at the boundaries.
        """
        #print "MBD", self.id, self.binSize
        self.binSize = self.rowIndices.shape[0]
        if(0 == np.size(self.rowIndices)):
            return

        # get the centroids
        (self.covMedians, self.covStdevs) = self.getCentroidStats(transformedCP)
        (self.lengthMean, self.lengthStd) = self.getCentroidStats(contigLengths)

        self.kValMeanNormPC1 = np_median(kmerPCs[self.rowIndices])
        self.kValStdevNormPC1 = np_std(kmerPCs[self.rowIndices])

        self.kMedian = np_median(kmerPCs[self.rowIndices], axis=0)
        self.kStdevs = np_std(kmerPCs[self.rowIndices], axis=0)

        cvals = self.getAverageCoverageDist(averageCoverages)
        self.cValMedian = np_around(np_median(cvals), decimals=3)
        self.cValStdev = np_around(np_std(cvals), decimals=3)

        self.gcMedian = np_median(contigGCs[self.rowIndices])
        self.gcStdev = np_std(contigGCs[self.rowIndices])

        # work out the total size
        self.totalBP = sum([contigLengths[i] for i in self.rowIndices])

        # set the acceptance ranges
        self.makeLimits()
Example 4
    def makeBinDist(self, transformedCP, averageCoverages, kmerNormPC1,
                    kmerPCs, contigGCs, contigLengths):
        """Determine the distribution of the points in this bin

        The distribution is largely normal, except at the boundaries.
        """
        #print("MBD", self.id, self.binSize)
        self.binSize = self.rowIndices.shape[0]
        if (0 == np.size(self.rowIndices)):
            return

        # get the centroids
        (self.covMedians,
         self.covStdevs) = self.getCentroidStats(transformedCP)
        (self.lengthMean,
         self.lengthStd) = self.getCentroidStats(contigLengths)

        self.kValMeanNormPC1 = np_median(kmerPCs[self.rowIndices])
        self.kValStdevNormPC1 = np_std(kmerPCs[self.rowIndices])

        self.kMedian = np_median(kmerPCs[self.rowIndices], axis=0)
        self.kStdevs = np_std(kmerPCs[self.rowIndices], axis=0)

        cvals = self.getAverageCoverageDist(averageCoverages)
        self.cValMedian = np_around(np_median(cvals), decimals=3)
        self.cValStdev = np_around(np_std(cvals), decimals=3)

        self.gcMedian = np_median(contigGCs[self.rowIndices])
        self.gcStdev = np_std(contigGCs[self.rowIndices])

        # work out the total size
        self.totalBP = sum([contigLengths[i] for i in self.rowIndices])

        # set the acceptance ranges
        self.makeLimits()
Example 5
def mad(arr):
    """ Median Absolute Deviation: a "Robust" version of standard deviation.
        Indicates the variability of the sample.
        https://en.wikipedia.org/wiki/Median_absolute_deviation
    """
    arr = np.ma.array(arr).compressed()  # drop masked values (plain arrays would be faster)
    med = np_median(arr)
    return np_median(np_abs(arr - med))
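
For intuition: adding one gross outlier barely moves the MAD, while the standard deviation inflates sharply. A quick check with made-up numbers:

    import numpy as np

    vals = np.array([9.0, 10.0, 10.0, 11.0, 10.0])
    spiked = np.append(vals, 100.0)      # inject one gross outlier
    print(np.std(vals), np.std(spiked))  # std jumps from ~0.63 to ~33.5
    print(mad(vals), mad(spiked))        # MAD moves from 0.0 only to 0.5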
Example 6
def create_feat_mat_1(graph):
    CCs = list(nx_clustering(graph).values())

    DCs = list(nx_average_neighbor_degree(graph).values())

    degrees = [tup[1] for tup in graph.degree()]

    edge_wts = [tup[2] for tup in graph.edges.data('weight')]

    A_mat = nx_to_numpy_matrix(graph)
    svs = np_linalg_svd(A_mat, full_matrices=False, compute_uv=False)

    if len(svs) >= 3:
        sv1 = svs[0]
        sv2 = svs[1]
        sv3 = svs[2]
    elif len(svs) >= 2:
        sv1 = svs[0]
        sv2 = svs[1]
        sv3 = 0
    else:
        sv1 = svs[0]
        sv2 = sv3 = 0

    feat_mat = np_vstack(
        (nx_density(graph), nx_number_of_nodes(graph), max(degrees),
         np_mean(degrees), np_median(degrees), np_var(degrees), max(CCs),
         np_mean(CCs), np_var(CCs), np_mean(edge_wts), max(edge_wts),
         np_var(edge_wts), np_mean(DCs), np_var(DCs), max(DCs), sv1, sv2,
         sv3)).T

    return feat_mat
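
The elif/else ladder above pads the top-three singular values with zeros. An equivalent one-liner, reusing svs from the function (a sketch; like the original it assumes len(svs) >= 1):

    import numpy as np

    # Zero-pad the singular values, then keep the top three
    sv1, sv2, sv3 = np.concatenate((svs, np.zeros(3)))[:3]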
Example 7
    def _write_rd_tree(self, tree, rel_node_dists, output_tree):
        """Write out tree with RED specified at each internal node."""

        # copy tree so node labels aren't changed in original tree
        red_tree = copy.deepcopy(tree)

        for node_id, n in enumerate(red_tree.preorder_node_iter()):
            if n == red_tree.seed_node:
                red = 0
            else:
                red = np_median(rel_node_dists[node_id])

            red_str = "|RED={:.3f}".format(red)
            if n.is_leaf():
                n.taxon.label += red_str
            else:
                if n.label:
                    n.label += red_str
                else:
                    n.label = red_str

        red_tree.write_to_path(output_tree,
                               schema='newick',
                               suppress_rooting=True,
                               unquoted_underscores=True)
Example 8
def diff_on_off(on_off, i_start=0, i_stop=-1):

    if i_start < 0:
        i_start = len(on_off) + i_start

    if i_stop < 0:
        i_stop = len(on_off) + i_stop

    n_samp = []
    keys = list(on_off.keys())
    for ii in range(i_start, i_stop):
        n_samp.append(on_off[keys[ii]].data[DATA_KEYS[0]].shape[0])
    n_samp = min(n_samp)
    print(n_samp)
    data_diff = {}
    for dkey in DATA_KEYS:
        data_diff[dkey] = []

    for ii in range(i_start, i_stop, 2):
        for dkey in DATA_KEYS:
            print(
                str(ii) + ": " + on_off[keys[ii]].state + " " +
                on_off[keys[ii + 1]].state)
            data_diff[dkey].append(
                on_off[keys[ii]].data[dkey][:n_samp, :] -
                on_off[keys[ii + 1]].data[dkey][:n_samp, :])

    for dkey in DATA_KEYS:
        data_diff[dkey] = np_median(np_array(data_diff[dkey]), 0)
    return data_diff
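
Each ON/OFF pair above contributes one difference array, and the final step collapses the stack with an element-wise median. A minimal illustration of that reduction, with invented shapes:

    import numpy as np

    # Three hypothetical ON-minus-OFF difference arrays of shape (n_samp, n_chan)
    diffs = [np.full((4, 3), v) for v in (1.0, 2.0, 9.0)]
    combined = np.median(np.array(diffs), 0)  # element-wise median across pairs
    print(combined)                           # every entry is 2.0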
Example 9
 def aggregateResources(self, nbins=20):
     """ returns a json object which contains max, min, mean, median, 
         and the histogram itself for all memories/cpu 
         WARNING: this method is not particularly efficient 
         and shouldn't be used lightly!
     """
     allData = {"memory": {"data": []}, "cpu": {"data": []}}
     query = JobInstance.objects.filter(job=self).only("cpu").only("memory")
     if query.count():
         for inst in query:
             agg = inst.aggregateResources()
             for key in ['cpu', 'memory']:
                 if len(agg[key]):
                     allData[key]['data'].append(max(agg[key]))
         del query
         # finished aggregation, now we can do calculations
         for key in allData:
             d = allData[key]["data"]
             allData[key]["max"] = max(d)
             allData[key]["min"] = min(d)
             arr = np_array(d, dtype=float)
             allData[key]["mean"] = float(np_mean(arr, axis=0))
             allData[key]["median"] = float(np_median(arr, axis=0))
             hist, bins = np_hist(arr, nbins)
             center = (bins[:-1] + bins[1:]) / 2
             w = (bins[1] - bins[0])
             histo = np_array([center, hist])
             allData[key]['histogram'] = {
                 "histo": histo.tolist(),
                 "histoT": histo.T.tolist(),
                 "binWidth": float(w)
             }
             del allData[key]['data']
     return dumps(allData)
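
The histogram block above derives bin centers and a uniform bin width from the edges returned by the histogram call. The same arithmetic in isolation, on made-up samples:

    import numpy as np

    arr = np.random.gamma(2.0, 2.0, size=1000)  # stand-in resource values
    hist, bins = np.histogram(arr, 20)
    center = (bins[:-1] + bins[1:]) / 2         # midpoint of each of the 20 bins
    width = bins[1] - bins[0]                   # uniform bin width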
Example 10
    def noise_dwt(cls, coeff, w):
        """Return the estimation of the DWT components noise level

        coeff: DWT coefficients
        w: pywt wavelet object
        """
        n_boot = 1000
        k_th = 10
        k_std = 1. / np_sqrt(2)
        std_l = []
        std_a = np_zeros(n_boot)
        wcomp = cls.wavecomp(coeff, w, len(coeff) - 1)

        for ii in range(n_boot):
            std_a[ii] = np_std(bootstrap_resample(wcomp, 10))

        stdv = np_median(std_a)
        std_l.append(stdv)
        for ll in range(len(coeff) - 2, 0, -1):
            stdv = stdv * k_std
            std_l.append(stdv)
        std_l.append(0)

        std_l.reverse()
        return np_array(std_l) * k_th
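
bootstrap_resample is not defined in this snippet; judging from the call bootstrap_resample(wcomp, 10), it draws a resample of the requested size with replacement. A minimal stand-in under that assumption:

    import numpy as np

    def bootstrap_resample(x, n):
        # Draw n samples from x with replacement (assumed semantics)
        return np.random.choice(np.asarray(x).ravel(), size=n, replace=True)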
Example 11
def test_median():
    for dtype in NUMERIC_TYPES:
        for shape in ((10,), (10, 11), (10, 11, 12)):
            X = (100 * (np.random.random(shape) - .5)).astype(dtype)
            for a in range(X.ndim):
                assert_array_equal(_median(X, axis=a).squeeze(),
                                   np_median(X.astype(np.float64), axis=a))
Example 12
def test_median():
    for dtype in NUMERIC_TYPES:
        for shape in ((10, ), (10, 11), (10, 11, 12)):
            X = (100 * (np.random.random(shape) - .5)).astype(dtype)
            for a in range(X.ndim):
                assert_array_equal(
                    _median(X, axis=a).squeeze(),
                    np_median(X.astype(np.float64), axis=a))
Example 13
    def _median_rank_rd(self, 
                            tree, 
                            placed_taxon, 
                            taxonomy,
                            trusted_taxa_file, 
                            min_children, 
                            min_support):
        """Calculate median relative divergence to each node and thresholds for each taxonomic rank.
        
        Parameters
        ----------
        tree : Tree
          Dendropy Tree.
        placed_taxon : set
          Taxon currently placed in tree which can be used for relative divergence inference.
        taxonomy: d[taxon_id] -> taxonomy info
          Taxonomic information for extant taxa.
        trusted_taxa_file : str
          File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        min_children : int
          Only consider taxa with at least the specified number of children taxa when inferring distribution.
        min_support : float
          Only consider taxa with at least this level of support when inferring distribution.
        
        Returns
        -------
        d[rank_index] -> float
          Median relative divergence for each taxonomic rank.
        """
                      
        # read trusted taxa
        trusted_taxa = None
        if trusted_taxa_file:
            trusted_taxa = read_taxa_file(trusted_taxa_file)
            
        # determine taxa to be used for inferring distribution
        taxa_for_dist_inference = filter_taxa_for_dist_inference(tree, 
                                                                    taxonomy, 
                                                                    trusted_taxa, 
                                                                    min_children, 
                                                                    min_support)
        taxa_for_dist_inference.intersection_update(placed_taxon)
 
        # infer distribution                                        
        outliers = Outliers()
        phylum_rel_dists, rel_node_dists = outliers.median_rd_over_phyla(tree, 
                                                                            taxa_for_dist_inference, 
                                                                            taxonomy)    
        median_for_rank = outliers.rank_median_rd(phylum_rel_dists, 
                                                    taxa_for_dist_inference)
                                                    
        # set edge lengths to median value over all rootings
        tree.seed_node.rel_dist = 0.0
        for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            n.rel_dist = np_median(rel_node_dists[n.id])
            
        return median_for_rank
Example 15
 def rank_median_rd(self, phylum_rel_dists, taxa_for_dist_inference):
     """Calculate median relative divergence for each rank.
     
     Parameters
     ----------
     phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
         Relative divergence of taxon at each rank for different phylum-level rootings.
     taxa_for_dist_inference : iterable
          Taxa to consider when inferring distributions.
     """
     
     medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)
 
     median_for_rank = {}
     for i, rank in enumerate(sorted(medians_for_taxa.keys())):
         v = [np_median(dists)
              for taxon, dists in medians_for_taxa[rank].items()
              if taxon in taxa_for_dist_inference]
         median_for_rank[rank] = np_median(v)
             
     return median_for_rank
Example 16
 def getCentroidStats(self, profile):
     """Calculate the centroids of the profile"""
     working_list = profile[self.rowIndices]
     
      # return the median and stdev
     # we divide by std so we need to make sure it's never 0
     tmp_stds = np_std(working_list, axis=0)
     mean_std = np_mean(tmp_stds)
     try:
         std = np_array([x if x != 0 else mean_std for x in tmp_stds])
      except Exception:
         std = mean_std
     return (np_median(working_list,axis=0), std)
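
The zero-substitution above guards a later division by the per-column standard deviation. A quick check of that substitution with invented numbers:

    import numpy as np

    tmp_stds = np.array([0.5, 0.0, 1.5])  # one degenerate column
    mean_std = np.mean(tmp_stds)          # ~0.667
    std = np.array([x if x != 0 else mean_std for x in tmp_stds])
    print(std)                            # [0.5, ~0.667, 1.5]; no zeros remain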
Example 17
    def getCentroidStats(self, profile):
        """Calculate the centroids of the profile"""
        working_list = profile[self.rowIndices]

        # return the median and stdev
        # we divide by std so we need to make sure it's never 0
        tmp_stds = np_std(working_list, axis=0)
        mean_std = np_mean(tmp_stds)
        try:
            std = np_array([x if x != 0 else mean_std for x in tmp_stds])
        except Exception:
            std = mean_std
        return (np_median(working_list, axis=0), std)
Example 18
    def rep_genome_stats(self, clusters, genome_files):
        """Calculate statistics relative to representative genome."""

        self.logger.info('Calculating statistics to cluster representatives:')
        stats = {}
        for idx, (rid, cids) in enumerate(clusters.items()):
            if len(cids) == 0:
                stats[rid] = self.RepStats(min_ani=-1,
                                           mean_ani=-1,
                                           std_ani=-1,
                                           median_ani=-1)
            else:
                # calculate ANI to representative genome
                gid_pairs = []
                for cid in cids:
                    gid_pairs.append((cid, rid))
                    gid_pairs.append((rid, cid))

                if True:  # *** DEBUGGING
                    ani_af = self.fastani.pairs(gid_pairs,
                                                genome_files,
                                                report_progress=False)
                else:
                    ani_af = self.fastani.ani_cache

                # calculate statistics
                anis = [FastANI.symmetric_ani(ani_af, cid, rid)[0]
                        for cid in cids]

                stats[rid] = self.RepStats(min_ani=min(anis),
                                           mean_ani=np_mean(anis),
                                           std_ani=np_std(anis),
                                           median_ani=np_median(anis))

            statusStr = '-> Processing %d of %d (%.2f%%) clusters.'.ljust(86) % (
                idx+1,
                len(clusters),
                float((idx+1)*100)/len(clusters))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

        sys.stdout.write('\n')

        return stats
Example 19
 def compute(self):
     """Detect RFI
     """
     m_ind = {'L': 0, 'R': 1, 'Q': 2, 'U': 3}
     n_rms = self.num_of_rms_above_median
     if n_rms[0] <= n_rms[1] or n_rms[0] >= n_rms[2]:
         raise ValueError(
             'num_of_rms_above_median[0] must lie strictly between '
             'elements [1] and [2]')
     for lab in self.flag_results.keys():
         med = np_median(self.data[m_ind[lab]][self.data[m_ind[lab]] > 10])
         rms = np_sqrt(((self.data[m_ind[lab]][self.data[m_ind[lab]] > 10] -
                         med)**2).mean())
         res = sp_threshold(self.data[m_ind[lab]] - med,
                            threshmin=self.num_of_rms_above_median[0] * rms,
                            newval=0)
         res[res > 0] = 1
         self.flag_results[lab].pola = lab
         self.flag_results[lab].flag_data = res
     return self.flag_results
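
sp_threshold appears to alias scipy.stats.threshold, which has long been removed from SciPy. A NumPy stand-in covering only the keywords used here (semantics assumed from the old function):

    import numpy as np

    def threshold(a, threshmin=None, newval=0):
        # Replace entries strictly below threshmin with newval
        a = np.array(a, dtype=float)
        if threshmin is not None:
            a[a < threshmin] = newval
        return a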
Example 20
 def _rep_genome_stats(self, clusters, genome_files):
     """Calculate statistics relative to representative genome."""
     
     self.logger.info('Calculating statistics to cluster representatives:')
     stats = {}
     for idx, (rid, cids) in enumerate(clusters.items()):
         if len(cids) == 0:
              stats[rid] = self.RepStats(min_ani=-1,
                                         mean_ani=-1,
                                         std_ani=-1,
                                         median_ani=-1)
         else:
             # calculate ANI to representative genome
             gid_pairs = []
             for cid in cids:
                 gid_pairs.append((cid, rid))
             ani_af = self.ani_cache.fastani_pairs(gid_pairs, 
                                                     genome_files, 
                                                     report_progress=False)
             
             # calculate statistics
             anis = [ani_af[cid][rid][0] for cid in cids]
              stats[rid] = self.RepStats(min_ani=min(anis),
                                         mean_ani=np_mean(anis),
                                         std_ani=np_std(anis),
                                         median_ani=np_median(anis))
                                         
         statusStr = '-> Processing %d of %d (%.2f%%) clusters.'.ljust(86) % (
                             idx+1, 
                             len(clusters), 
                             float((idx+1)*100)/len(clusters))
         sys.stdout.write('%s\r' % statusStr)
         sys.stdout.flush()
             
     sys.stdout.write('\n')
         
     return stats
Example 21
    def _median_summary_outlier_file(self, phylum_rel_dists,
                                     taxa_for_dist_inference,
                                     gtdb_parent_ranks, outlier_table,
                                     rank_file, verbose_table):
        """Identify outliers relative to the median of rank distributions.
        Parameters
        ----------
        phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
            Relative divergence of taxon at each rank for different phylum-level rootings.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        gtdb_parent_ranks: d[taxon] -> string indicating parent taxa
            Parent taxa for each taxon.
        outlier_table : str
            Desired name of output table.
        rank_file : str
            Desired name of file indicating median relative distance of each rank.
        verbose_table : boolean
            Print additional columns in output table.
        """

        # determine median relative distance for each taxa
        medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

        # determine median relative distance for each rank
        median_for_rank = self.rank_median_rd(phylum_rel_dists,
                                              taxa_for_dist_inference)

        with open(rank_file, 'w') as fout_rank:
            median_str = []
            for rank in sorted(median_for_rank.keys()):
                median_str.append('"' + Taxonomy.rank_labels[rank] + '":' +
                                  str(median_for_rank[rank]))
            fout_rank.write('{' + ','.join(median_str) + '}\n')

        fout = open(outlier_table, 'w')
        if verbose_table:
            fout.write('Taxa\tGTDB taxonomy\tMedian distance')
            fout.write('\tMedian of rank\tMedian difference')
            fout.write('\tClosest rank\tClassification\n')
        else:
            fout.write(
                'Taxa\tGTDB taxonomy\tMedian distance\tMedian difference\tClosest rank\tClassification\n'
            )

        for rank in sorted(median_for_rank.keys()):
            for clade_label, dists in medians_for_taxa[rank].items():
                dists = np_array(dists)

                taxon_median = np_median(dists)
                delta = taxon_median - median_for_rank[rank]

                closest_rank_dist = 1e10
                for test_rank, test_median in median_for_rank.items():
                    abs_dist = abs(taxon_median - test_median)
                    if abs_dist < closest_rank_dist:
                        closest_rank_dist = abs_dist
                        closest_rank = Taxonomy.rank_labels[test_rank]

                classification = "OK"
                if delta < -0.2:
                    classification = "very overclassified"
                elif delta < -0.1:
                    classification = "overclassified"
                elif delta > 0.2:
                    classification = "very underclassified"
                elif delta > 0.1:
                    classification = "underclassified"

                if verbose_table:
                    fout.write(
                        '%s\t%s\t%.2f\t%.3f\t%.3f\t%s\t%s\n' %
                        (clade_label, ';'.join(gtdb_parent_ranks[clade_label]),
                         taxon_median, median_for_rank[rank], delta,
                         closest_rank, classification))
                else:
                    fout.write(
                        '%s\t%s\t%.3f\t%.3f\t%s\t%s\n' %
                        (clade_label, ';'.join(gtdb_parent_ranks[clade_label]),
                         taxon_median, delta, closest_rank, classification))
        fout.close()
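
The delta bands above implement a +/-0.1 / +/-0.2 rule on the difference from the rank median. Restated as a small pure function (same logic, just isolated for clarity):

    def classify_delta(delta):
        # RED offset from the rank median -> classification label
        if delta < -0.2:
            return "very overclassified"
        if delta < -0.1:
            return "overclassified"
        if delta > 0.2:
            return "very underclassified"
        if delta > 0.1:
            return "underclassified"
        return "OK"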
Example 22
    def _distribution_summary_plot(self, phylum_rel_dists,
                                   taxa_for_dist_inference, plot_file):
        """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

        Parameters
        ----------
        phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
            Relative divergence of taxon at each rank for different phylum-level rootings.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        # determine median relative distance for each taxa
        medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

        # create percentile and classification boundary lines
        percentiles = {}
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            v = [
                np_median(dists)
                for taxon, dists in medians_for_taxa[rank].items()
                if taxon in taxa_for_dist_inference
            ]
            if not v:
                # no taxa at this rank are suitable for creating
                # classification boundaries
                continue

            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.25),
                    c=(0.3, 0.3, 0.3),
                    lw=2,
                    zorder=2)
            ax.plot((p50, p50), (i, i + 0.5),
                    c=(0.3, 0.3, 0.3),
                    lw=2,
                    zorder=2)
            ax.plot((p90, p90), (i, i + 0.25),
                    c=(0.3, 0.3, 0.3),
                    lw=2,
                    zorder=2)

            for b in [-0.2, -0.1, 0.1, 0.2]:
                boundary = p50 + b
                if 1.0 > boundary > 0.0:
                    if abs(b) == 0.1:
                        c = (1.0, 0.65, 0.0)  # orange
                    else:
                        c = (1.0, 0.0, 0.0)
                    ax.plot((boundary, boundary), (i, i + 0.5),
                            c=c,
                            lw=2,
                            zorder=2)

            percentiles[i] = [p10, p50, p90]

        # create scatter plot and results table
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label +
                               ' (%d)' % len(medians_for_taxa[rank]))

            mono = []
            poly = []
            no_inference = []
            for clade_label, dists in medians_for_taxa[rank].items():
                md = np_median(dists)
                x.append(md)
                y.append(i)
                labels.append(clade_label)

                if self._is_integer(clade_label.split('^')[-1]):
                    # taxa with a numerical suffix after a caret indicate
                    # polyphyletic groups when decorated with tax2tree
                    c.append((1.0, 0.0, 0.0))
                    poly.append(md)
                elif clade_label not in taxa_for_dist_inference:
                    c.append((0.3, 0.3, 0.3))
                    no_inference.append(md)
                else:
                    c.append((0.0, 0.0, 1.0))
                    mono.append(md)

            # histogram for each rank
            # bins and w are needed by all three histogram blocks below, so
            # compute them up front (the original defined them only when mono
            # was non-empty, which could raise a NameError later)
            n = 0
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)
            num_taxa = len(mono) + len(poly) + len(no_inference)
            w = float(len(mono)) / num_taxa if num_taxa else 0.0
            if len(mono) > 0:
                mono = np_array(mono)
                no_inference = np_array(no_inference)
                poly = np_array(poly)

                mono_max_count = max(np_histogram(mono, bins=bins)[0])
                mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

                n, b, p = ax.hist(mono,
                                  bins=bins,
                                  color=(0.0, 0.0, 1.0),
                                  alpha=0.25,
                                  weights=0.9 * w * mono_weights,
                                  bottom=i,
                                  lw=0,
                                  zorder=0)

            if len(no_inference) > 0:
                no_inference_max_count = max(
                    np_histogram(no_inference, bins=bins)[0])
                no_inference_weights = np_ones_like(no_inference) * (
                    1.0 / no_inference_max_count)

                ax.hist(no_inference,
                        bins=bins,
                        color=(0.3, 0.3, 0.3),
                        alpha=0.25,
                        weights=0.9 * (1.0 - w) * no_inference_weights,
                        bottom=i + n,
                        lw=0,
                        zorder=0)

            if len(poly) > 0:
                poly_max_count = max(np_histogram(poly, bins=bins)[0])
                poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

                ax.hist(poly,
                        bins=bins,
                        color=(1.0, 0.0, 0.0),
                        alpha=0.25,
                        weights=0.9 * (1.0 - w) * poly_weights,
                        bottom=i + n,
                        lw=0,
                        zorder=0)

        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.01, 1.01])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(list(range(0, len(medians_for_taxa))))
        ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(
            self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig,
                              mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
Example 23
    def pairwise_stats(self, clusters, genome_files):
        """Calculate statistics for all pairwise comparisons in a species cluster."""

        self.logger.info(
            f'Restricting pairwise comparisons to {self.max_genomes_for_stats:,} randomly selected genomes.')
        self.logger.info(
            'Calculating statistics for all pairwise comparisons in a species cluster:')

        stats = {}
        for idx, (rid, cids) in enumerate(clusters.items()):
            statusStr = '-> Processing {:,} of {:,} ({:.2f}%) clusters (size = {:,}).'.ljust(86).format(
                idx+1,
                len(clusters),
                float((idx+1)*100)/len(clusters),
                len(cids))
            sys.stdout.write('{}\r'.format(statusStr))
            sys.stdout.flush()

            if len(cids) == 0:
                stats[rid] = self.PairwiseStats(min_ani=-1,
                                                mean_ani=-1,
                                                std_ani=-1,
                                                median_ani=-1,
                                                ani_to_medoid=-1,
                                                mean_ani_to_medoid=-1,
                                                mean_ani_to_rep=-1,
                                                ani_below_95=-1)
            else:
                if len(cids) > self.max_genomes_for_stats:
                    cids = set(random.sample(list(cids), self.max_genomes_for_stats))

                # calculate ANI to representative genome
                gid_pairs = []
                gids = list(cids.union([rid]))
                for gid1, gid2 in combinations(gids, 2):
                    gid_pairs.append((gid1, gid2))
                    gid_pairs.append((gid2, gid1))

                if True:  # ***DEBUGGING
                    ani_af = self.fastani.pairs(gid_pairs,
                                                genome_files,
                                                report_progress=False)
                else:
                    ani_af = self.fastani.ani_cache

                # calculate medoid point
                if len(gids) > 2:
                    dist_mat = np_zeros((len(gids), len(gids)))
                    for i, gid1 in enumerate(gids):
                        for j, gid2 in enumerate(gids):
                            if i < j:
                                ani, _af = FastANI.symmetric_ani(
                                    ani_af, gid1, gid2)
                                dist_mat[i, j] = 100 - ani
                                dist_mat[j, i] = 100 - ani

                    medoid_idx = np_argmin(dist_mat.sum(axis=0))
                    medoid_gid = gids[medoid_idx]
                else:
                    # with only 2 genomes in a cluster, the representative is the
                    # natural medoid at least for reporting statistics for the
                    # individual species cluster
                    medoid_gid = rid

                mean_ani_to_medoid = np_mean([FastANI.symmetric_ani(ani_af, gid, medoid_gid)[0]
                                              for gid in gids if gid != medoid_gid])

                mean_ani_to_rep = np_mean([FastANI.symmetric_ani(ani_af, gid, rid)[0]
                                           for gid in gids if gid != rid])

                if mean_ani_to_medoid < mean_ani_to_rep:
                    self.logger.error('mean_ani_to_medoid < mean_ani_to_rep')
                    sys.exit(-1)

                # calculate statistics
                anis = []
                for gid1, gid2 in combinations(gids, 2):
                    ani, _af = FastANI.symmetric_ani(ani_af, gid1, gid2)
                    anis.append(ani)

                stats[rid] = self.PairwiseStats(
                    min_ani=min(anis),
                    mean_ani=np_mean(anis),
                    std_ani=np_std(anis),
                    median_ani=np_median(anis),
                    ani_to_medoid=FastANI.symmetric_ani(
                        ani_af, rid, medoid_gid)[0],
                    mean_ani_to_medoid=mean_ani_to_medoid,
                    mean_ani_to_rep=mean_ani_to_rep,
                    ani_below_95=sum([1 for ani in anis if ani < 95]))

        sys.stdout.write('\n')

        return stats
Example 24
    def run(self, scaffold_stats):
        """Calculate statistics for genomes.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        """

        self.logger.info(
            "Calculating statistics for {:,} genomes over {:,} scaffolds.".
            format(scaffold_stats.num_genomes(),
                   scaffold_stats.num_scaffolds()))

        self.coverage_headers = scaffold_stats.coverage_headers
        self.signature_headers = scaffold_stats.signature_headers

        genome_size = defaultdict(int)
        scaffold_length = defaultdict(list)
        gc = defaultdict(list)
        coverage = defaultdict(list)
        signature = defaultdict(list)
        for _scaffold_id, stats in scaffold_stats.stats.items():
            if stats.genome_id == scaffold_stats.unbinned:
                continue

            genome_size[stats.genome_id] += stats.length
            scaffold_length[stats.genome_id].append(stats.length)
            gc[stats.genome_id].append(stats.gc)
            coverage[stats.genome_id].append(stats.coverage)
            signature[stats.genome_id].append(stats.signature)

        # record statistics for each genome
        genomic_signature = GenomicSignature(0)

        self.genome_stats = {}
        for genome_id in genome_size:
            # calculate weighted mean and median statistics
            weights = np_array(scaffold_length[genome_id])

            len_array = np_array(scaffold_length[genome_id])
            mean_len = ws.numpy_weighted_mean(len_array, weights)
            median_len = ws.numpy_weighted_median(len_array, weights)

            gc_array = np_array(gc[genome_id])
            mean_gc = ws.numpy_weighted_mean(gc_array, weights)
            median_gc = ws.numpy_weighted_median(gc_array, weights)

            cov_array = np_array(coverage[genome_id]).T
            mean_cov = ws.numpy_weighted_mean(cov_array, weights)
            median_cov = []
            for i in range(cov_array.shape[0]):
                median_cov.append(
                    ws.numpy_weighted_median(cov_array[i, :], weights))

            signature_array = np_array(signature[genome_id]).T
            mean_signature = ws.numpy_weighted_mean(signature_array, weights)

            # calculate mean and median tetranucleotide distance
            td = []
            for scaffold_id in scaffold_stats.scaffolds_in_genome[genome_id]:
                stats = scaffold_stats.stats[scaffold_id]
                td.append(
                    genomic_signature.manhattan(stats.signature,
                                                mean_signature))

            self.genome_stats[genome_id] = self.GenomeStats(
                genome_size[genome_id], mean_len, median_len, mean_gc,
                median_gc, mean_cov, median_cov, mean_signature, np_mean(td),
                np_median(td))

        return self.genome_stats
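
Here ws presumably aliases the weightedstats package, whose numpy_weighted_mean and numpy_weighted_median take a data array and a weights array (the package identity is an assumption). A tiny usage sketch:

    import numpy as np
    import weightedstats as ws  # assumed to be the 'weightedstats' package

    lengths = np.array([100.0, 200.0, 700.0])
    weights = lengths  # scaffolds weighted by their own length, as above
    print(ws.numpy_weighted_mean(lengths, weights))    # length-weighted mean
    print(ws.numpy_weighted_median(lengths, weights))  # length-weighted median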
Example 25
    def run(self, input_tree, 
                    taxonomy_file, 
                    output_dir, 
                    plot_taxa_file,
                    plot_dist_taxa_only,
                    plot_domain,
                    highlight_polyphyly,
                    highlight_taxa_file,
                    trusted_taxa_file,
                    fixed_root,
                    min_children, 
                    min_support,
                    mblet,
                    fmeasure_table,
                    min_fmeasure,
                    fmeasure_mono,
                    verbose_table):
        """Determine distribution of taxa at each taxonomic rank.

        Parameters
        ----------
        input_tree : str
          Name of input tree.
        taxonomy_file : str
          File with taxonomy strings for each taxa.
        output_dir : str
          Desired output directory.
        plot_taxa_file : str
          File specifying taxa to plot. Set to None to consider all taxa.
        plot_dist_taxa_only : boolean    
          Only plot the taxa used to infer distribution.
        plot_domain : boolean
          Plot domain rank.
        trusted_taxa_file : str
          File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        fixed_root : boolean
          Use a single fixed root to infer outliers.
        min_children : int
          Only consider taxa with at least the specified number of children taxa when inferring distribution.
        min_support : float
          Only consider taxa with at least this level of support when inferring distribution.
        verbose_table : boolean
          Print additional columns in output table.
        """
        
        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree, 
                                            schema='newick', 
                                            rooting='force-rooted', 
                                            preserve_underscores=True)

        input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

        # pull taxonomy from tree and file
        self.logger.info('Reading taxonomy.')
        taxonomy = Taxonomy().read(taxonomy_file)
        tree_taxonomy = Taxonomy().read_from_tree(input_tree,
                                                    warnings=False)
            
        gtdb_parent_ranks = Taxonomy().parents(tree_taxonomy)

        # read trusted taxa
        trusted_taxa = None
        if trusted_taxa_file:
            trusted_taxa = read_taxa_file(trusted_taxa_file)
            
        # read F-measure for taxa
        fmeasure = None
        if fmeasure_table:
            fmeasure = self.read_fmeasure(fmeasure_table)

        # determine taxa to be used for inferring distribution
        taxa_for_dist_inference = filter_taxa_for_dist_inference(tree, 
                                                                    taxonomy, 
                                                                    trusted_taxa, 
                                                                    min_children, 
                                                                    min_support,
                                                                    fmeasure,
                                                                    min_fmeasure)

        # limit plotted taxa
        taxa_to_plot = None
        if plot_dist_taxa_only:
            taxa_to_plot = taxa_for_dist_inference
        elif plot_taxa_file:
            taxa_to_plot = read_taxa_file(plot_taxa_file)
        else:
            # plot every taxon defined in tree
            taxa_to_plot = set()
            for node in tree.preorder_node_iter():
                support, taxon, _auxiliary_info = parse_label(node.label)
                if taxon:
                    taxon = taxon.split(';')[-1].strip() # get most specific taxon from compound names 
                                                         # (e.g. p__Armatimonadetes; c__Chthonomonadetes)
                    taxa_to_plot.add(taxon)
            
            if False:
                # HACK FOR NCBI: only plot taxa with >= 2 taxa
                taxa_to_plot = set()
                for node in tree.preorder_node_iter():
                    if not node.label or node.is_leaf():
                        continue

                    support, taxon, _auxiliary_info = parse_label(node.label)
                    if not taxon:
                        continue
                    taxon = taxon.split(';')[-1].strip() # get most specific taxon from compound names 
                                                         # (e.g. p__Armatimonadetes; c__Chthonomonadetes)
                  
                    # count number of subordinate children
                    rank_prefix = taxon[0:3]
                    if min_children > 0 and rank_prefix != 's__':
                        child_rank_index = Taxonomy().rank_index[rank_prefix] + 1
                        child_rank_prefix = Taxonomy.rank_prefixes[child_rank_index]
                        subordinate_taxa = set()
                        for leaf in node.leaf_iter():
                            taxa = taxonomy.get(leaf.taxon.label, Taxonomy.rank_prefixes)
                            if len(taxa) > child_rank_index:
                                sub_taxon = taxa[child_rank_index]
                                if sub_taxon != Taxonomy.rank_prefixes[child_rank_index] and sub_taxon.startswith(child_rank_prefix):
                                    subordinate_taxa.add(sub_taxon)

                        if len(subordinate_taxa) < min_children:
                            continue
                            
                    taxa_to_plot.add(taxon)
            
        # highlight taxa
        highlight_taxa = set()
        if highlight_taxa_file:
            for line in open(highlight_taxa_file):
                highlight_taxa.add(line.strip().split('\t')[0])
                
        # check if a single fixed root should be used
        if fixed_root or mblet:
            self.logger.info('Using single fixed rooting for inferring distributions.')
            if not mblet:
                rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)
            else:
                rel_dists = self.mblet(tree, taxa_for_dist_inference)
                
            # create fixed rooting style tables and plots
            distribution_table = os.path.join(output_dir, '%s.rank_distribution.tsv' % input_tree_name)
            plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
            self._distribution_plot(rel_dists, 
                                        taxa_for_dist_inference,
                                        highlight_polyphyly,
                                        highlight_taxa,
                                        distribution_table,
                                        fmeasure,
                                        fmeasure_mono,
                                        plot_file)

            median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
            self._median_outlier_file(rel_dists, 
                                        taxa_for_dist_inference, 
                                        gtdb_parent_ranks, 
                                        median_outlier_table)
        else:
            # calculate relative distance to taxa
            rd = RelativeDistance()
            rel_dists = rd.rel_dist_to_named_clades(tree)
            
            # restrict to taxa of interest
            if taxa_to_plot:
                for r in rel_dists:
                    for k in set(rel_dists[r].keys()) - set(taxa_to_plot):
                        del rel_dists[r][k]
            
            # report number of taxa at each rank
            print('')
            print('Rank\tTaxa to Plot\tTaxa for Inference')
            for rank, taxa in rel_dists.items():
                taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
                print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference)))
            print('')
        
            # *** determine phyla for inferring distribution
            if True:
                phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree, 
                                                                                taxa_for_dist_inference)
            else:                                                                    
                phyla_for_inference = filter_taxa_for_dist_inference(tree, 
                                                                        taxonomy, 
                                                                        trusted_taxa, 
                                                                        2, 
                                                                        min_support,
                                                                        fmeasure,
                                                                        min_fmeasure)
                phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree, 
                                                                                phyla_for_inference)
                print('')
                print('Phyla for RED Inference:')
                print(','.join(phylum_rel_dists))
                phyla_file = os.path.join(output_dir, '%s.phyla.tsv' % input_tree_name)
                fout = open(phyla_file, 'w')
                for p in phylum_rel_dists:
                    fout.write(p + '\n')
                fout.close()
                                                                            
            # set edge lengths to median value over all rootings
            tree.seed_node.rel_dist = 0.0
            for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
                n.rel_dist = np_median(rel_node_dists[n.id])
                rd_to_parent = n.rel_dist - n.parent_node.rel_dist
                if rd_to_parent < 0:
                    self.logger.warning('Not all branches are positive after scaling.')
                n.edge_length = rd_to_parent

            for phylum, rel_dists in phylum_rel_dists.items():
                phylum_dir = os.path.join(output_dir, phylum)
                if not os.path.exists(phylum_dir):
                    os.makedirs(phylum_dir)
                    
                # restrict to taxa of interest
                if taxa_to_plot:
                    for r in rel_dists:
                        for k in set(rel_dists[r].keys()) - set(taxa_to_plot):
                            del rel_dists[r][k]
                    
                # create distribution plot
                distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
                plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
                self._distribution_plot(rel_dists, 
                                        taxa_for_dist_inference,
                                        highlight_polyphyly,
                                        highlight_taxa,
                                        distribution_table,
                                        fmeasure,
                                        fmeasure_mono,
                                        plot_file)

                median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
                self._median_outlier_file(rel_dists, 
                                            taxa_for_dist_inference, 
                                            gtdb_parent_ranks,
                                            median_outlier_table)
   
            plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
            self._distribution_summary_plot(phylum_rel_dists, 
                                            taxa_for_dist_inference,
                                            highlight_polyphyly,
                                            highlight_taxa,
                                            fmeasure,
                                            fmeasure_mono,
                                            plot_file)

            median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
            median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
            self._median_summary_outlier_file(phylum_rel_dists, 
                                                taxa_for_dist_inference, 
                                                gtdb_parent_ranks, 
                                                median_outlier_table, 
                                                median_rank_file, 
                                                verbose_table)

        output_rd_file = os.path.join(output_dir, '%s.node_rd.tsv' % input_tree_name)
        self._write_rd(tree, output_rd_file)
                                                
        output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
        tree.write_to_path(output_tree, 
                            schema='newick', 
                            suppress_rooting=True, 
                            unquoted_underscores=True)                
Example 26
    def run(self, input_tree, 
                    taxonomy_file, 
                    output_dir, 
                    plot_taxa_file,
                    plot_dist_taxa_only,
                    plot_domain,
                    trusted_taxa_file,
                    fixed_root,
                    min_children, 
                    min_support,
                    verbose_table):
        """Determine distribution of taxa at each taxonomic rank.

        Parameters
        ----------
        input_tree : str
          Name of input tree.
        taxonomy_file : str
          File with taxonomy strings for each taxa.
        output_dir : str
          Desired output directory.
        plot_taxa_file : str
          File specifying taxa to plot. Set to None to consider all taxa.
        plot_dist_taxa_only : boolean    
          Only plot the taxa used to infer distribution.
        plot_domain : boolean
          Plot domain rank.
        trusted_taxa_file : str
          File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        fixed_root : boolean
          Use a single fixed root to infer outliers.
        min_children : int
          Only consider taxa with at least the specified number of children taxa when inferring distribution.
        min_support : float
          Only consider taxa with at least this level of support when inferring distribution.
        verbose_table : boolean
          Print additional columns in output table.
        """

        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree, 
                                            schema='newick', 
                                            rooting='force-rooted', 
                                            preserve_underscores=True)

        input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

        # pull taxonomy from tree
        if not taxonomy_file:
            self.logger.info('Reading taxonomy from tree.')
            taxonomy_file = os.path.join(output_dir, '%s.taxonomy.tsv' % input_tree_name)
            taxonomy = Taxonomy().read_from_tree(input_tree)
            Taxonomy().write(taxonomy, taxonomy_file)
        else:
            self.logger.info('Reading taxonomy from file.')
            taxonomy = Taxonomy().read(taxonomy_file)
            
        gtdb_parent_ranks = Taxonomy().parents(taxonomy)

        # read trusted taxa
        trusted_taxa = None
        if trusted_taxa_file:
            trusted_taxa = read_taxa_file(trusted_taxa_file)

        # determine taxa to be used for inferring distribution
        taxa_for_dist_inference = filter_taxa_for_dist_inference(tree, taxonomy, trusted_taxa, min_children, min_support)

        # limit plotted taxa
        taxa_to_plot = None
        if plot_dist_taxa_only:
            taxa_to_plot = taxa_for_dist_inference
        elif plot_taxa_file:
            taxa_to_plot = read_taxa_file(plot_taxa_file)
            
        # check if a single fixed root should be used
        if fixed_root:
            self.logger.info('Using single fixed rooting for inferring distributions.')
            rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)

            # create fixed rooting style tables and plots
            # distinct name so this table is not overwritten by the
            # median outlier table written below
            distribution_table = os.path.join(output_dir, '%s.rank_distribution.tsv' % input_tree_name)
            plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
            self._distribution_plot(rel_dists, taxa_for_dist_inference, distribution_table, plot_file)

            median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
            self._median_outlier_file(rel_dists, 
                                        taxa_for_dist_inference, 
                                        gtdb_parent_ranks, 
                                        median_outlier_table)
        else:
            # calculate relative distance to taxa
            rd = RelativeDistance()
            rel_dists = rd.rel_dist_to_named_clades(tree)
        
            # report number of taxa at each rank
            print('')
            print('Rank\tTaxa to Plot\tTaxa for Inference')
            for rank, taxa in rel_dists.items():
                taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
                print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference)))
            print('')
        
            phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree, 
                                                                            taxa_for_dist_inference,
                                                                            taxonomy)
                                                                            
            # set edge lengths to median value over all rootings
            tree.seed_node.rel_dist = 0.0
            for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
                n.rel_dist = np_median(rel_node_dists[n.id])
                rd_to_parent = n.rel_dist - n.parent_node.rel_dist
                if rd_to_parent < 0:
                    self.logger.warning('Not all branches are positive after scaling.')
                n.edge_length = rd_to_parent

            for phylum, rel_dists in phylum_rel_dists.items():
                phylum_dir = os.path.join(output_dir, phylum)
                if not os.path.exists(phylum_dir):
                    os.makedirs(phylum_dir)
                    
                # create distribution plot
                distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
                plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
                self._distribution_plot(rel_dists, taxa_for_dist_inference, distribution_table, plot_file)

                median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
                self._median_outlier_file(rel_dists, 
                                            taxa_for_dist_inference, 
                                            gtdb_parent_ranks,
                                            median_outlier_table)
   
            plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
            self._distribution_summary_plot(phylum_rel_dists, taxa_for_dist_inference, plot_file)

            median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
            median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
            self._median_summary_outlier_file(phylum_rel_dists, 
                                                taxa_for_dist_inference, 
                                                gtdb_parent_ranks, 
                                                median_outlier_table, 
                                                median_rank_file, 
                                                verbose_table)

        output_rd_file = os.path.join(output_dir, '%s.node_rd.tsv' % input_tree_name)
        self._write_rd(tree, output_rd_file)
                                                
        output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
        tree.write_to_path(output_tree, 
                            schema='newick', 
                            suppress_rooting=True, 
                            unquoted_underscores=True)                
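
The scaling loop above relies on a simple invariant: once each node's rel_dist is set to its median RED and edge_length is child RED minus parent RED, the root-to-node path length of every node equals its RED value. A minimal sketch of that invariant on a toy tree (plain dicts instead of dendropy; all values hypothetical):

# Toy check: edge lengths derived from per-node RED values make each
# node's distance from the root equal its RED (values hypothetical).
parent = {'root': None, 'A': 'root', 'B': 'root', 'A1': 'A', 'A2': 'A'}
red = {'root': 0.0, 'A': 0.4, 'B': 0.55, 'A1': 1.0, 'A2': 1.0}

edge_length = {n: red[n] - red[parent[n]] for n in parent if parent[n]}

def depth(n):
    """Sum of edge lengths from the root down to node n."""
    return 0.0 if parent[n] is None else depth(parent[n]) + edge_length[n]

for n in red:
    assert abs(depth(n) - red[n]) < 1e-12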
Esempio n. 27
0
    def _median_outlier_file(self, 
                                rel_dists,
                                taxa_for_dist_inference,
                                gtdb_parent_ranks, 
                                output_file):
        """Identify outliers relative to the median of rank distributions.

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        gtdb_parent_ranks: d[taxon] -> string indicating parent taxa
            Parent taxa for each taxon.
        output_file : str
            Desired name of output table.
        """

        # determine median relative distance for each rank
        median_rel_dist = {}
        for rank, d in rel_dists.items():
            v = [dist for taxon, dist in d.items() if taxon in taxa_for_dist_inference]
            if len(v) == 0:
                continue
                
            median_rel_dist[rank] = np_median(v)

        fout = open(output_file, 'w')
        fout.write('Taxa\tGTDB taxonomy\tMedian distance\tMedian difference\tClosest rank\tClassification\n')
            
        for i, rank in enumerate(sorted(rel_dists.keys())):
            for clade_label, dist in rel_dists[rank].items():
                if rank in median_rel_dist:
                    delta = dist - median_rel_dist[rank]
                    closest_rank_dist = 1e10
                    for test_rank, test_median in median_rel_dist.items():
                        abs_dist = abs(dist - test_median)
                        if abs_dist < closest_rank_dist:
                            closest_rank_dist = abs_dist
                            closest_rank = Taxonomy.rank_labels[test_rank]

                    classification = "OK"
                    if delta < -0.2:
                        classification = "very overclassified"
                    elif delta < -0.1:
                        classification = "overclassified"
                    elif delta > 0.2:
                        classification = "very underclassified"
                    elif delta > 0.1:
                        classification = "underclassified"

                    fout.write('%s\t%s\t%.3f\t%.3f\t%s\t%s\n' % (clade_label,
                                                                   ';'.join(gtdb_parent_ranks[clade_label]),
                                                                   dist,
                                                                   delta,
                                                                   closest_rank,
                                                                   classification))
                else:
                    fout.write('%s\t%s\t%.3f\t%.3f\t%s\t%s\n' % (clade_label,
                                                                   ';'.join(gtdb_parent_ranks[clade_label]),
                                                                   dist,
                                                                   -1,
                                                                   'NA',
                                                                   'Insufficient data to calculate median for rank.'))
        fout.close()
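
The delta thresholds above (±0.1, ±0.2) form a small ladder that recurs in the summary variant later in this listing. The same banding as a standalone sketch (function name hypothetical):

def classify_red_delta(delta):
    """Label a taxon by how far its RED sits from the rank median."""
    if delta < -0.2:
        return 'very overclassified'
    elif delta < -0.1:
        return 'overclassified'
    elif delta > 0.2:
        return 'very underclassified'
    elif delta > 0.1:
        return 'underclassified'
    return 'OK'

assert classify_red_delta(-0.15) == 'overclassified'
assert classify_red_delta(0.05) == 'OK'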
Esempio n. 28
0
    def _pairwise_stats(self, clusters, genome_files):
        """Calculate statistics for all pairwise comparisons in a species cluster."""
        
        self.logger.info('Calculating statistics for all pairwise comparisons in a species cluster:')
        stats = {}
        for idx, (rid, cids) in enumerate(clusters.items()):
            statusStr = '-> Processing %d of %d (%.2f%%) clusters (size = %d).'.ljust(86) % (
                                idx+1, 
                                len(clusters), 
                                float((idx+1)*100)/len(clusters),
                                len(cids))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
                                
            if len(cids) == 0:
                stats[rid] = self.PairwiseStats(min_ani = -1,
                                                mean_ani = -1,
                                                std_ani = -1,
                                                median_ani = -1,
                                                ani_to_medoid = -1,
                                                mean_ani_to_medoid = -1,
                                                ani_below_95 = -1)
            else:
                if len(cids) > self.max_genomes_for_stats:
                    # sample from a list; random.sample() rejects sets
                    # from Python 3.11 onwards
                    cids = set(random.sample(list(cids), self.max_genomes_for_stats))
                
                # calculate ANI between all pairs of genomes in the cluster
                gid_pairs = []
                gids = list(cids.union([rid]))
                for gid1, gid2 in combinations(gids, 2):
                    gid_pairs.append((gid1, gid2))
                    gid_pairs.append((gid2, gid1))
                    
                ani_af = self.ani_cache.fastani_pairs(gid_pairs, 
                                                        genome_files, 
                                                        report_progress=False)
                                                        
                # calculate medoid point
                if len(gids) > 2:
                    dist_mat = np_zeros((len(gids), len(gids)))
                    for i, gid1 in enumerate(gids):
                        for j, gid2 in enumerate(gids):
                            if i < j:
                                ani, af = symmetric_ani(ani_af, gid1, gid2)
                                # store a distance (ANI is a similarity) so the
                                # argmin below picks the most central genome
                                dist_mat[i, j] = 100.0 - ani
                                dist_mat[j, i] = 100.0 - ani

                    medoid_idx = np_argmin(dist_mat.sum(axis=0))
                    medoid_gid = gids[medoid_idx]
                else:
                    # with only 2 genomes in a cluster, the representative is the
                    # natural medoid at least for reporting statistics for the
                    # individual species cluster
                    medoid_gid = rid
                    
                mean_ani_to_medoid = np_mean([symmetric_ani(ani_af, gid, medoid_gid)[0] 
                                                for gid in gids if gid != medoid_gid])

                # calculate statistics
                anis = []
                for gid1, gid2 in combinations(gids, 2):
                    ani, af = symmetric_ani(ani_af, gid1, gid2)
                    anis.append(ani)
                    
                stats[rid] = self.PairwiseStats(min_ani = min(anis),
                                                mean_ani = np_mean(anis),
                                                std_ani = np_std(anis),
                                                median_ani = np_median(anis),
                                                ani_to_medoid = symmetric_ani(ani_af, rid, medoid_gid)[0],
                                                mean_ani_to_medoid = mean_ani_to_medoid,
                                                ani_below_95 = sum([1 for ani in anis if ani < 95]))

        sys.stdout.write('\n')
            
        return stats
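
The medoid computation above reduces to an argmin over the column sums of a symmetric distance matrix. A self-contained numpy sketch with toy distances (not real ANI values):

import numpy as np
from numpy import argmin as np_argmin  # matching the aliased import style above

# Toy symmetric distance matrix for four genomes; the medoid minimizes
# the total distance to all other genomes.
D = np.array([[0.0, 1.0, 3.0, 9.0],
              [1.0, 0.0, 1.5, 8.0],
              [3.0, 1.5, 0.0, 7.0],
              [9.0, 8.0, 7.0, 0.0]])
medoid_idx = np_argmin(D.sum(axis=0))
print(medoid_idx)  # 1: the second genome has the smallest summed distance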
Esempio n. 29
0
    def _calculate_red_distances(self, input_tree, out_dir):
        """
        Provide a taxonomy string to a user genome based on the reference genomes of the same clade.
        If the clade contains multiple reference genomes we are comparing their taxonomies.
        -If all reference genomes have the same taxonomy up to the 'closest rank' ,
        the taxonomy string including the closest rank is returned.
        -If **NOT** all reference genomes have the same taxonomy up to the 'closest rank',
        the taxonomy string **NOT** including the closest rank is returned.
       
        Parameters
        ----------
        list_subnode : list of leaf nodes including multiple reference genome.
        closest_rank : last rank of the reference taxonomy
        gtdb_taxonomy : dictionary storing all the reference taxonomies
        
    
        Returns
        -------
        string
            Taxonomy string.
        """

        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        self.logger.info('Reading taxonomy from file.')
        taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

        # determine taxa to be used for inferring distribution
        trusted_taxa = None
        taxa_for_dist_inference = self._filter_taxa_for_dist_inference(
            tree, taxonomy, trusted_taxa, Config.RED_MIN_CHILDREN,
            Config.RED_MIN_SUPPORT)

        phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(
            tree, taxa_for_dist_inference, taxonomy)

        # set edge lengths to median value over all rootings
        tree.seed_node.rel_dist = 0.0
        for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            n.rel_dist = np_median(rel_node_dists[n.id])
            rd_to_parent = n.rel_dist - n.parent_node.rel_dist
            if rd_to_parent < 0:
                # This can occur since we are setting all nodes
                # to their median RED value.
                #self.logger.warning('Not all branches are positive after scaling.')
                pass
            n.edge_length = rd_to_parent

        if False:
            # These plots can be useful for debugging and internal use,
            # but are likely to be confusing to users.
            rd = RelativeDistance()

            input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]
            plot_file = os.path.join(out_dir, '%s.png' % input_tree_name)
            rd._distribution_summary_plot(phylum_rel_dists,
                                          taxa_for_dist_inference, plot_file)

            gtdb_parent_ranks = Taxonomy().parents(taxonomy)
            median_outlier_table = os.path.join(out_dir,
                                                '%s.tsv' % input_tree_name)
            median_rank_file = os.path.join(out_dir,
                                            '%s.dict' % input_tree_name)
            rd._median_summary_outlier_file(phylum_rel_dists,
                                            taxa_for_dist_inference,
                                            gtdb_parent_ranks,
                                            median_outlier_table,
                                            median_rank_file, False)

            input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]
            output_tree = os.path.join(out_dir,
                                       '%s.scaled.tree' % input_tree_name)
            tree.write_to_path(output_tree,
                               schema='newick',
                               suppress_rooting=True,
                               unquoted_underscores=True)

        return tree
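
For reference, the rel_dist values being medianed here come from a RED decoration pass; one common formulation (assumed here, following the PhyloRank-style code this example resembles) is red(n) = p + d / (d + u) * (1 - p), with p the parent's RED, d the branch to the parent, and u the mean distance from n to its descendant tips; leaves are pinned at 1.0 and the root at 0.0. A toy, dendropy-free sketch of that recursion:

# Hedged sketch of the RED recursion (formula as assumed above); toy tree
# stored as plain dicts, all branch lengths hypothetical.
children = {'root': ['A', 'B'], 'A': ['A1', 'A2'], 'B': [], 'A1': [], 'A2': []}
branch = {'A': 0.2, 'B': 0.7, 'A1': 0.5, 'A2': 0.3}  # length to parent

def mean_tip_dist(node):
    """Mean path length from an internal node down to its leaf tips."""
    tips = []
    def collect(n, d):
        if not children[n]:
            tips.append(d)
        for c in children[n]:
            collect(c, d + branch[c])
    collect(node, 0.0)
    return sum(tips) / len(tips)

red = {'root': 0.0}
def assign(node):
    for c in children[node]:
        if not children[c]:
            red[c] = 1.0  # leaves are fixed at RED = 1
        else:
            d, u = branch[c], mean_tip_dist(c)
            red[c] = red[node] + (d / (d + u)) * (1.0 - red[node])
        assign(c)
assign('root')
print(red['A'])  # 0.2 / (0.2 + 0.4) = 1/3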
Esempio n. 30
0
    def _distribution_summary_plot(self, 
                                    phylum_rel_dists, 
                                    taxa_for_dist_inference, 
                                    highlight_polyphyly,
                                    highlight_taxa,
                                    fmeasure,
                                    fmeasure_mono,
                                    plot_file):
        """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

        Parameters
        ----------
        phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
            Relative divergence of taxon at each rank for different phylum-level rootings.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        highlight_polyphyly : boolean
            Highlight taxa with an F-measure below fmeasure_mono.
        highlight_taxa : iterable
            Taxa to highlight regardless of their F-measure.
        fmeasure : d[taxon] -> float
            F-measure of monophyly for each taxon.
        fmeasure_mono : float
            F-measure threshold at or above which a taxon is considered monophyletic.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        # determine median relative distance for each taxa
        medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

        # create percentile and classification boundary lines
        percentiles = {}
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            v = [np_median(dists) for taxon, dists in medians_for_taxa[rank].items() if taxon in taxa_for_dist_inference]
            if not v:
                # no taxa at this rank are suitable for creating classification boundaries
                continue
            
            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            #ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), c=(0.0, 0.0, 1.0), lw=2, zorder=2)
            #ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

            for b in [-0.1, 0.1]:
                boundary = p50 + b
                if boundary < 1.0 and boundary > 0.0:
                    if abs(b) == 0.1:
                        c = (0.0, 0.0, 0.0)
                    else:
                        c = (1.0, 0.0, 0.0)
                    ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

            percentiles[i] = [p10, p50, p90]

        # create scatter plot and results table
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label.capitalize() + ' (%d)' % len(medians_for_taxa[rank]))

            mono = []
            poly = []
            near_mono = []
            for clade_label, dists in medians_for_taxa[rank].items():
                md = np_median(dists)
                x.append(md)
                y.append(i)
                labels.append(clade_label)

                if ((highlight_polyphyly and fmeasure[clade_label] < fmeasure_mono) or clade_label in highlight_taxa):
                    c.append((1.0,0.0,0.0))
                    poly.append(md)
                elif (highlight_polyphyly and fmeasure[clade_label] != 1.0):
                    c.append((255.0/255,187.0/255,120.0/255))
                    near_mono.append(md)
                else:
                    c.append((152.0/255,223.0/255,138.0/255))
                    mono.append(md)

            # histogram for each rank
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)
            max_bin_count = max(np_histogram(mono + near_mono + poly, bins=bins)[0])

            mono_bottom = 0
            near_mono_bottom = 0
            mono = np_array(mono)
            near_mono = np_array(near_mono)
            poly = np_array(poly)
            if len(mono) > 0:
                mono_bottom, b, p = ax.hist(mono, bins=bins,
                          color=(152.0/255,223.0/255,138.0/255),
                          alpha=0.5,
                          weights=0.9 * (1.0 / max_bin_count) * np_ones_like(mono),
                          bottom=i,
                          lw=0,
                          zorder=0)

            if len(near_mono) > 0:
                near_mono_bottom, b, p = ax.hist(near_mono, bins=bins,
                                              color=(255.0/255,187.0/255,120.0/255),
                                              alpha=0.5,
                                              weights=0.9 * (1.0 / max_bin_count) * np_ones_like(near_mono),
                                              bottom=i + mono_bottom,
                                              lw=0,
                                              zorder=0)

            if len(poly) > 0:
                ax.hist(poly, bins=bins,
                          color=(1.0, 0.0, 0.0),
                          alpha=0.5,
                          weights=0.9 * (1.0 / max_bin_count) * np_ones_like(poly),
                          bottom=i + mono_bottom + near_mono_bottom,
                          lw=0,
                          zorder=0)
                          
        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('Relative Evolutionary Divergence')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.01, 1.01])

        ax.set_ylabel('Rank (no. taxa)')
        ax.set_yticks(range(0, len(medians_for_taxa)))
        ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
        self.fig.savefig(plot_file.replace('.png', '.svg'), dpi=self.dpi)
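
The percentile lines above boil down to one np.percentile call plus a clip of the ±0.1 boundaries into (0, 1). The same computation in isolation (toy RED medians):

from numpy import percentile as np_percentile  # matching the aliased imports above

v = [0.31, 0.35, 0.42, 0.47, 0.55]  # toy median REDs for the taxa of one rank
p10, p50, p90 = np_percentile(v, [10, 50, 90])
boundaries = [p50 + b for b in (-0.1, 0.1) if 0.0 < p50 + b < 1.0]
print(p50, boundaries)  # the median line and its classification boundaries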
Esempio n. 31
0
    def _distribution_summary_plot(self, phylum_rel_dists, taxa_for_dist_inference, plot_file):
        """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

        Parameters
        ----------
        phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
            Relative divergence of taxon at each rank for different phylum-level rootings.
        taxa_for_dist_inference : iterable
            Taxa to considered when inferring distributions.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        # determine median relative distance for each taxa
        medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

        # create percentile and classification boundary lines
        percentiles = {}
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            v = [np_median(dists) for taxon, dists in medians_for_taxa[rank].items() if taxon in taxa_for_dist_inference]
            if not v:
                # no taxa at this rank are suitable for creating classification boundaries
                continue
            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

            for b in [-0.2, -0.1, 0.1, 0.2]:
                boundary = p50 + b
                if boundary < 1.0 and boundary > 0.0:
                    if abs(b) == 0.1:
                        c = (1.0, 0.65, 0.0)  # orange
                    else:
                        c = (1.0, 0.0, 0.0)
                    ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

            percentiles[i] = [p10, p50, p90]

        # create scatter plot and results table
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label + ' (%d)' % len(medians_for_taxa[rank]))

            mono = []
            poly = []
            no_inference = []
            for clade_label, dists in medians_for_taxa[rank].items():
                md = np_median(dists)
                x.append(md)
                y.append(i)
                labels.append(clade_label)

                if is_integer(clade_label.split('^')[-1]):
                    # taxa with a numerical suffix after a caret indicate 
                    # polyphyletic groups when decorated with tax2tree
                    c.append((1.0, 0.0, 0.0))
                    poly.append(md)
                elif clade_label not in taxa_for_dist_inference:
                    c.append((0.3, 0.3, 0.3))
                    no_inference.append(md)
                else:
                    c.append((0.0, 0.0, 1.0))
                    mono.append(md)

            # histogram for each rank
            mono = np_array(mono)
            no_inference = np_array(no_inference)
            poly = np_array(poly)
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)

            # compute the monophyletic fraction first so it is available
            # even when a rank has no monophyletic taxa
            w = float(len(mono)) / (len(mono) + len(poly) + len(no_inference))

            n = 0
            if len(mono) > 0:
                mono_max_count = max(np_histogram(mono, bins=bins)[0])
                mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

                n, b, p = ax.hist(mono, bins=bins,
                                  color=(0.0, 0.0, 1.0),
                                  alpha=0.25,
                                  weights=0.9 * w * mono_weights,
                                  bottom=i,
                                  lw=0,
                                  zorder=0)
                      
            if len(no_inference) > 0:
                no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
                no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)

                ax.hist(no_inference, bins=bins,
                          color=(0.3, 0.3, 0.3),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * no_inference_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)

            if len(poly) > 0:
                poly_max_count = max(np_histogram(poly, bins=bins)[0])
                poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

                ax.hist(poly, bins=bins,
                          color=(1.0, 0.0, 0.0),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * poly_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)

        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.01, 1.01])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(range(0, len(medians_for_taxa)))
        ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
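
The weights passed to ax.hist in both plotting methods serve one purpose: scaling each sample by 0.9 / max_bin_count caps the tallest bar at 0.9 axis units, so every rank's histogram stays inside its own horizontal band above baseline i. The normalization in isolation (toy data):

import numpy as np

md_values = np.array([0.30, 0.31, 0.31, 0.44, 0.45, 0.46, 0.46, 0.46])
bins = np.arange(0, 1.0 + 0.025, 0.025)
max_bin_count = max(np.histogram(md_values, bins=bins)[0])
weights = 0.9 * (1.0 / max_bin_count) * np.ones_like(md_values)
counts, _ = np.histogram(md_values, bins=bins, weights=weights)
assert abs(counts.max() - 0.9) < 1e-9  # tallest bar is 0.9 units high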
Esempio n. 32
0
    def run(self, input_tree, 
                    taxonomy_file, 
                    output_dir, 
                    plot_taxa_file,
                    plot_dist_taxa_only,
                    plot_domain,
                    trusted_taxa_file,
                    fixed_root,
                    min_children, 
                    min_support,
                    verbose_table):
        """Determine distribution of taxa at each taxonomic rank.

        Parameters
        ----------
        input_tree : str
          Name of input tree.
        taxonomy_file : str
          File with taxonomy strings for each taxon.
        output_dir : str
          Desired output directory.
        plot_taxa_file : str
          File specifying taxa to plot. Set to None to consider all taxa.
        plot_dist_taxa_only : boolean    
          Only plot the taxa used to infer distribution.
        plot_domain : boolean
          Plot domain rank.
        trusted_taxa_file : str
          File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        fixed_root : boolean
          Use a single fixed root to infer outliers.
        min_children : int
          Only consider taxa with at least the specified number of children taxa when inferring distribution.
        min_support : float
          Only consider taxa with at least this level of support when inferring distribution.
        verbose_table : boolean
          Print additional columns in output table.
        """

        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree, 
                                            schema='newick', 
                                            rooting='force-rooted', 
                                            preserve_underscores=True)

        input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

        # pull taxonomy from tree
        if not taxonomy_file:
            self.logger.info('Reading taxonomy from tree.')
            taxonomy_file = os.path.join(output_dir, '%s.taxonomy.tsv' % input_tree_name)
            taxonomy = Taxonomy().read_from_tree(input_tree)
            Taxonomy().write(taxonomy, taxonomy_file)
        else:
            self.logger.info('Reading taxonomy from file.')
            taxonomy = Taxonomy().read(taxonomy_file)
            
        gtdb_parent_ranks = Taxonomy().parents(taxonomy)

        # read trusted taxa
        trusted_taxa = None
        if trusted_taxa_file:
            trusted_taxa = read_taxa_file(trusted_taxa_file)

        # determine taxa to be used for inferring distribution
        taxa_for_dist_inference = filter_taxa_for_dist_inference(tree, taxonomy, trusted_taxa, min_children, min_support)
        
        # limit plotted taxa
        taxa_to_plot = None
        if plot_dist_taxa_only:
            taxa_to_plot = taxa_for_dist_inference
        elif plot_taxa_file:
            taxa_to_plot = read_taxa_file(plot_taxa_file)
            
        # check if a single fixed root should be used
        if fixed_root:
            self.logger.info('Using single fixed rooting for inferring distributions.')
            rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)

            # create fixed rooting style tables and plots
            # distinct name so this table is not overwritten by the
            # median outlier table written below
            distribution_table = os.path.join(output_dir, '%s.rank_distribution.tsv' % input_tree_name)
            plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
            self._distribution_plot(rel_dists, taxa_for_dist_inference, distribution_table, plot_file)

            median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
            self._median_outlier_file(rel_dists, 
                                        taxa_for_dist_inference, 
                                        gtdb_parent_ranks, 
                                        median_outlier_table)
        else:
            # calculate relative distance to taxa
            rd = RelativeDistance()
            rel_dists = rd.rel_dist_to_named_clades(tree)
        
            # report number of taxa at each rank
            print('')
            print('Rank\tTaxa to Plot\tTaxa for Inference')
            for rank, taxa in rel_dists.items():
                taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
                print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference)))
            print('')
        
            phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree, 
                                                                            taxa_for_dist_inference,
                                                                            taxonomy)
                                                                            
            # set edge lengths to median value over all rootings
            tree.seed_node.rel_dist = 0.0
            for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
                n.rel_dist = np_median(rel_node_dists[n.id])
                rd_to_parent = n.rel_dist - n.parent_node.rel_dist
                if rd_to_parent < 0:
                    self.logger.warning('Not all branches are positive after scaling.')
                n.edge_length = rd_to_parent

            for phylum, rel_dists in phylum_rel_dists.items():
                phylum_dir = os.path.join(output_dir, phylum)
                if not os.path.exists(phylum_dir):
                    os.makedirs(phylum_dir)
                    
                # create distribution plot
                distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
                plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
                self._distribution_plot(rel_dists, taxa_for_dist_inference, distribution_table, plot_file)

                median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
                self._median_outlier_file(rel_dists, 
                                            taxa_for_dist_inference, 
                                            gtdb_parent_ranks,
                                            median_outlier_table)
   
            plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
            self._distribution_summary_plot(phylum_rel_dists, taxa_for_dist_inference, plot_file)

            median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
            median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
            self._median_summary_outlier_file(phylum_rel_dists, 
                                                taxa_for_dist_inference, 
                                                gtdb_parent_ranks, 
                                                median_outlier_table, 
                                                median_rank_file, 
                                                verbose_table)
                                                
        output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
        tree.write_to_path(output_tree, 
                            schema='newick', 
                            suppress_rooting=True, 
                            unquoted_underscores=True)                
Esempio n. 33
0
    def _median_outlier_file(self, 
                                rel_dists,
                                taxa_for_dist_inference,
                                gtdb_parent_ranks, 
                                output_file):
        """Identify outliers relative to the median of rank distributions.

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        gtdb_parent_ranks: d[taxon] -> string indicating parent taxa
            Parent taxa for each taxon.
        output_file : str
            Desired name of output table.
        """

        # determine median relative distance for each rank
        median_rel_dist = {}
        for rank, d in rel_dists.items():
            v = [dist for taxon, dist in d.items() if taxon in taxa_for_dist_inference]
            if len(v) == 0:
                continue
                
            median_rel_dist[rank] = np_median(v)

        fout = open(output_file, 'w')
        fout.write('Taxa\tGTDB taxonomy\tMedian distance\tMedian difference\tClosest rank\tClassification\n')
            
        for i, rank in enumerate(sorted(rel_dists.keys())):
            for clade_label, dist in rel_dists[rank].items():
                if rank in median_rel_dist:
                    delta = dist - median_rel_dist[rank]
                    closest_rank_dist = 1e10
                    for test_rank, test_median in median_rel_dist.items():
                        abs_dist = abs(dist - test_median)
                        if abs_dist < closest_rank_dist:
                            closest_rank_dist = abs_dist
                            closest_rank = Taxonomy.rank_labels[test_rank]

                    classification = "OK"
                    if delta < -0.2:
                        classification = "very overclassified"
                    elif delta < -0.1:
                        classification = "overclassified"
                    elif delta > 0.2:
                        classification = "very underclassified"
                    elif delta > 0.1:
                        classification = "underclassified"

                    fout.write('%s\t%s\t%.3f\t%.3f\t%s\t%s\n' % (clade_label,
                                                                   ';'.join(gtdb_parent_ranks[clade_label]),
                                                                   dist,
                                                                   delta,
                                                                   closest_rank,
                                                                   classification))
                else:
                    fout.write('%s\t%s\t%.3f\t%.3f\t%s\t%s\n' % (clade_label,
                                                                   ';'.join(gtdb_parent_ranks[clade_label]),
                                                                   dist,
                                                                   -1,
                                                                   'NA',
                                                                   'Insufficient data to calculate median for rank.'))
        fout.close()
Esempio n. 34
0
    def _median_summary_outlier_file(self, phylum_rel_dists,
                                            taxa_for_dist_inference,
                                            gtdb_parent_ranks,
                                            outlier_table,
                                            rank_file,
                                            verbose_table):
        """Identify outliers relative to the median of rank distributions.

        Parameters
        ----------
        phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
            Relative divergence of taxon at each rank for different phylum-level rootings.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        gtdb_parent_ranks: d[taxon] -> string indicating parent taxa
            Parent taxa for each taxon.
        outlier_table : str
            Desired name of output table.
        rank_file : str
            Desired name of file indicating median relative distance of each rank.
        verbose_table : boolean
            Print additional columns in output table.
        """
        
        # determine median relative distance for each taxa
        medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

        # determine median relative distance for each rank
        median_for_rank = self.rank_median_rd(phylum_rel_dists, taxa_for_dist_inference)
          
        fout_rank = open(rank_file, 'w')
        median_str = []
        for rank in sorted(median_for_rank.keys()):
            median_str.append('"' + Taxonomy.rank_labels[rank] + '":' + str(median_for_rank[rank]))
        fout_rank.write('{' + ','.join(median_str) + '}\n')
        fout_rank.close()
            
        fout = open(outlier_table, 'w')
        if verbose_table:
            fout.write('Taxa\tGTDB taxonomy\tMedian distance')
            fout.write('\tMedian of rank\tMedian difference')
            fout.write('\tClosest rank\tClassification\n')
        else:
            fout.write('Taxa\tGTDB taxonomy\tMedian distance\tMedian difference\tClosest rank\tClassification\n')
        
        for rank in sorted(median_for_rank.keys()):
            for clade_label, dists in medians_for_taxa[rank].items():
                dists = np_array(dists)

                taxon_median = np_median(dists)
                delta = taxon_median - median_for_rank[rank]

                closest_rank_dist = 1e10
                for test_rank, test_median in median_for_rank.items():
                    abs_dist = abs(taxon_median - test_median)
                    if abs_dist < closest_rank_dist:
                        closest_rank_dist = abs_dist
                        closest_rank = Taxonomy.rank_labels[test_rank]

                classification = "OK"
                if delta < -0.2:
                    classification = "very overclassified"
                elif delta < -0.1:
                    classification = "overclassified"
                elif delta > 0.2:
                    classification = "very underclassified"
                elif delta > 0.1:
                    classification = "underclassified"

                if verbose_table:
                    fout.write('%s\t%s\t%.2f\t%.3f\t%.3f\t%s\t%s\n' % (clade_label,
                                                                       ';'.join(gtdb_parent_ranks[clade_label]),
                                                                       taxon_median,
                                                                       median_for_rank[rank],
                                                                       delta,
                                                                       closest_rank,
                                                                       classification))
                else:
                    fout.write('%s\t%s\t%.3f\t%.3f\t%s\t%s\n' % (clade_label,
                                                                   ';'.join(gtdb_parent_ranks[clade_label]),
                                                                   taxon_median,
                                                                   delta,
                                                                   closest_rank,
                                                                   classification))
        fout.close()
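
The rank_file written above holds a single dict literal, e.g. {"domain":0.0,"phylum":0.33}. A hedged read-back sketch (path hypothetical; ast.literal_eval parses the double-quoted keys and float values):

import ast

with open('my_tree.dict') as f:  # hypothetical path produced by the code above
    median_for_rank = ast.literal_eval(f.read())
print(median_for_rank.get('phylum'))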
Esempio n. 35
0
# Find the median by counting: assumes an odd number of pairwise-distinct
# values, so exactly one element has equally many smaller and larger values.
for i in orig_list3:
    more, less = 0, 0
    for j in orig_list3:
        if i > j:
            more += 1
        elif i < j:
            less += 1
    if more == less:
        median = i
        break
print('Median:', median)

from numpy import median as np_median

print('Cross-check with numpy:', np_median(orig_list3))


def gnome_sort(orig_list):
    i = 1
    while i < len(orig_list):
        if not i or orig_list[i - 1] <= orig_list[i]:
            i += 1
        else:
            orig_list[i], orig_list[i - 1] = orig_list[i - 1], orig_list[i]
            i -= 1
    return orig_list


print('Gnome sort, ascending:', gnome_sort(orig_list3))
print('m-th element:', gnome_sort(orig_list3)[m])
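
The counting approach only terminates for an odd number of pairwise-distinct values; the usual sorted-list definition also covers the even case, where the median is the mean of the two middle values. A quick cross-check:

from statistics import median as stat_median
from numpy import median as np_median

vals = [7, 1, 5, 3, 9]    # odd length, distinct values: both agree on 5
assert stat_median(vals) == np_median(vals) == 5

vals_even = [7, 1, 5, 3]  # even length: mean of the two middle values
assert stat_median(vals_even) == np_median(vals_even) == 4.0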
Esempio n. 36
0
def create_feat_mat(graph_list, n_feats):
    dens_pos = [nx_density(graph) for graph in graph_list]
    nodes_pos = [nx_number_of_nodes(graph) for graph in graph_list]

    # CC statistics - mean and max  - faster to use a big loop mostly
    CC_mean = []
    CC_mean_append = CC_mean.append
    CC_max = []
    CC_max_append = CC_max.append
    CC_var = []
    CC_var_append = CC_var.append
    # Degree correlation - avg degree of the neighborhood     
    DC_mean = []
    DC_mean_append = DC_mean.append
    DC_max = []
    DC_max_append = DC_max.append
    DC_var = []
    DC_var_append = DC_var.append
    # Degree statistics
    degree_mean = []
    degree_mean_append = degree_mean.append
    degree_max = []
    degree_max_append = degree_max.append
    degree_median = []
    degree_median_append = degree_median.append
    degree_var = []
    degree_var_append = degree_var.append
    # Edge weight statistics 
    edge_wt_mean = []
    edge_wt_mean_append = edge_wt_mean.append
    edge_wt_max = []
    edge_wt_max_append = edge_wt_max.append
    edge_wt_var = []
    edge_wt_var_append = edge_wt_var.append
    # First 3 singular values 
    sv1 = []
    sv1_append = sv1.append
    sv2 = []
    sv2_append = sv2.append
    sv3 = []
    sv3_append = sv3.append
    for graph in graph_list:

        CCs = list(nx_clustering(graph).values())
        CC_max_append(max(CCs))
        CC_mean_append(np_mean(CCs))
        CC_var_append(np_var(CCs))

        DCs = list(nx_average_neighbor_degree(graph).values())
        DC_max_append(max(DCs))
        DC_mean_append(np_mean(DCs))
        DC_var_append(np_var(DCs))

        degrees = [tup[1] for tup in graph.degree()]
        degree_mean_append(np_mean(degrees))
        degree_median_append(np_median(degrees))
        degree_max_append(max(degrees))
        degree_var_append(np_var(degrees))

        edge_wts = [tup[2] for tup in graph.edges.data('weight')]
        edge_wt_mean_append(np_mean(edge_wts))
        edge_wt_var_append(np_var(edge_wts))
        edge_wt_max_append(max(edge_wts))

        # nx_to_numpy_matrix aliases networkx.to_numpy_matrix, removed in
        # NetworkX 3.0; on newer versions use nx.to_numpy_array instead
        A_mat = nx_to_numpy_matrix(graph)
        svs = np_linalg_svd(A_mat, full_matrices=False, compute_uv=False)

        if len(svs) >= 3:
            sv1_append(svs[0])
            sv2_append(svs[1])
            sv3_append(svs[2])
        elif len(svs) >= 2:
            sv1_append(svs[0])
            sv2_append(svs[1])
            sv3_append(0)
        else:
            sv1_append(svs[0])
            sv2_append(0)
            sv3_append(0)

    feat_mat = np_vstack((dens_pos, nodes_pos, degree_max, degree_mean, degree_median, degree_var, CC_max, CC_mean,
                          CC_var, edge_wt_mean, edge_wt_max, edge_wt_var, DC_mean, DC_var, DC_max, sv1, sv2, sv3)).T

    if n_feats == 1:
        feat_mat = np_array(dens_pos).reshape(-1, 1)

    return feat_mat
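
A hedged usage sketch for create_feat_mat on two toy weighted graphs; it assumes the module's aliased imports (nx_density, np_vstack, etc.) are in scope and that an older NetworkX (< 3.0) still provides to_numpy_matrix:

import networkx as nx

g1 = nx.Graph()
g1.add_weighted_edges_from([(0, 1, 0.9), (1, 2, 0.8), (0, 2, 0.7)])
g2 = nx.Graph()
g2.add_weighted_edges_from([(0, 1, 0.5), (1, 2, 0.4), (2, 3, 0.6), (3, 0, 0.3)])

feat_mat = create_feat_mat([g1, g2], n_feats=18)
print(feat_mat.shape)  # (2, 18): one row of graph statistics per graph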