def create_feat_mat_1(graph):
    """Build a 1 x 18 feature vector of graph-level statistics for a weighted graph."""
    CCs = list(nx_clustering(graph).values())

    DCs = list(nx_average_neighbor_degree(graph).values())

    degrees = [tup[1] for tup in graph.degree()]

    edge_wts = [tup[2] for tup in graph.edges.data('weight')]

    A_mat = nx_to_numpy_matrix(graph)
    svs = np_linalg_svd(A_mat, full_matrices=False, compute_uv=False)

    if len(svs) >= 3:
        sv1 = svs[0]
        sv2 = svs[1]
        sv3 = svs[2]
    elif len(svs) >= 2:
        sv1 = svs[0]
        sv2 = svs[1]
        sv3 = 0
    else:
        sv1 = svs[0]
        sv2 = sv3 = 0

    feat_mat = np_vstack(
        (nx_density(graph), nx_number_of_nodes(graph), max(degrees),
         np_mean(degrees), np_median(degrees), np_var(degrees), max(CCs),
         np_mean(CCs), np_var(CCs), np_mean(edge_wts), max(edge_wts),
         np_var(edge_wts), np_mean(DCs), np_var(DCs), max(DCs), sv1, sv2,
         sv3)).T

    return feat_mat
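The `nx_*` and `np_*` names above are aliased imports. A minimal sketch of the aliases this snippet appears to assume (inferred from the call names, not confirmed by the source):

# Assumed import aliases for create_feat_mat_1 (inferred, not from the source).
from networkx import clustering as nx_clustering
from networkx import average_neighbor_degree as nx_average_neighbor_degree
from networkx import to_numpy_matrix as nx_to_numpy_matrix  # removed in NetworkX 3.0; to_numpy_array is the modern equivalent
from networkx import density as nx_density
from networkx import number_of_nodes as nx_number_of_nodes
from numpy import mean as np_mean, median as np_median, var as np_var, vstack as np_vstack
from numpy.linalg import svd as np_linalg_svd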
 def resolve_by_intra_specific_ani(self, gid_anis):
     """Resolve by removing intra-specific genomes with divergent ANI values."""
     
     if len(gid_anis) <= 2:
         return False, {}
     
     # consider most divergent genome as untrustworthy
     untrustworthy_gids = {}
     while True:
         # find most divergent genome
         min_ani = 100
         untrustworthy_gid = None
         for gid in gid_anis:
             if gid in untrustworthy_gids:
                 continue
                 
             anis = [ani for cur_gid, ani in gid_anis[gid].items() if cur_gid not in untrustworthy_gids]
             if np_mean(anis) < min_ani:
                 min_ani = np_mean(anis)
                 untrustworthy_gid = gid
         
         untrustworthy_gids[untrustworthy_gid] = f'{min_ani:.2f}% ANI to other type strain genomes'
         
         all_similar = self.check_strain_ani(gid_anis, untrustworthy_gids)
 
         if all_similar:
             return True, untrustworthy_gids
         
         remaining_genomes = len(gid_anis) - len(untrustworthy_gids)
         if remaining_genomes <= 2 or len(untrustworthy_gids) >= len(gid_anis):
             return False, {}
Example #3
def get_field(self, axes_list):
    """Returns the values of the field (with symmetries and sums).
    Parameters
    ----------
    self: Data
        a Data object
    axes_list: list
        a list of RequestedAxis objects
    Returns
    -------
    values: ndarray
        values of the field
    """

    values = self.values
    for axis_requested in axes_list:
        # Rebuild symmetries when needed
        axis_symmetries = self.axes[axis_requested.index].symmetries
        if (
            (axis_requested.transform == "fft" and axis_requested.is_pattern)
            or (
                axis_requested.extension in ["sum", "rss", "mean", "rms", "integrate"]
                and axis_requested.is_pattern
            )
        ):
            values = take(values, axis_requested.rebuild_indices, axis_requested.index)
        elif axis_requested.transform == "fft" and "antiperiod" in axis_symmetries:
            nper = axis_symmetries["antiperiod"]
            axis_symmetries["antiperiod"] = 2
            values = rebuild_symmetries(values, axis_requested.index, axis_symmetries)
            axis_symmetries["antiperiod"] = nper
        elif axis_requested.indices is not None:
            if (
                axis_requested.extension in ["sum", "rss", "mean", "rms", "integrate"]
                or max(axis_requested.indices) > values.shape[axis_requested.index]
            ):
                values = rebuild_symmetries(
                    values, axis_requested.index, axis_symmetries
                )
                self.axes[axis_requested.index].symmetries = dict()

        # sum over sum axes
        if axis_requested.extension == "sum":
            values = np_sum(values, axis=axis_requested.index, keepdims=True)
        # root sum square over rss axes
        elif axis_requested.extension == "rss":
            values = sqrt(np_sum(values ** 2, axis=axis_requested.index, keepdims=True))
        # mean value over mean axes
        elif axis_requested.extension == "mean":
            values = np_mean(values, axis=axis_requested.index, keepdims=True)
        # RMS over rms axes
        elif axis_requested.extension == "rms":
            values = sqrt(
                np_mean(values ** 2, axis=axis_requested.index, keepdims=True)
            )
        # integration over integration axes
        elif axis_requested.extension == "integrate":
            values = trapz(
                values, x=axis_requested.values, axis=axis_requested.index
            ) / (np_max(axis_requested.values) - np_min(axis_requested.values))
    return values
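In the last branch the trapezoidal integral is divided by the axis span, which turns the integral into an average over that axis. A small self-contained check of that identity, independent of the Data class above:

# Minimal check: trapezoidal integral divided by the span equals the average value.
import numpy as np
from scipy.integrate import trapezoid  # older SciPy versions expose this as trapz

x = np.linspace(0.0, 2.0, 201)
y = 3.0 * x                      # average value over [0, 2] is 3.0
avg = trapezoid(y, x=x) / (x.max() - x.min())
print(avg)                       # ~3.0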
def predict_image(path_of_image, groupStage):
    path_of_model = os_path.join("./CUSTOMIZE_4_USER/MODEL_TRAINING",
                                 groupStage, groupStage + ".pth")
    path_of_feature = os_path.join("./CUSTOMIZE_4_USER/MODEL_TRAINING",
                                   groupStage, groupStage + ".npz")

    start_time = time()
    model = NeuralNet(input_size, hidden_size, num_classes).to(device)
    model.load_state_dict(load(path_of_model))

    data = np_load(path_of_feature)
    [h_max, s_max, v_max] = data['data_max']
    [h_min, s_min, v_min] = data['data_min']

    img = imread(path_of_image)
    img = resize(img, (6000, 4000))
    img = img[500:-500, 750:-750, :]
    img = cvtColor(img, COLOR_BGR2HSV)
    hchan, schan, vchan = split(img)
    h_hist = calcHist([img], [0], None, [256], [0, 256]).reshape(256, )
    s_hist = calcHist([img], [1], None, [256], [0, 256]).reshape(256, )
    v_hist = calcHist([img], [2], None, [256], [0, 256]).reshape(256, )

    hMean = np_mean(hchan) / 255
    DPV_h_max = np_sum(np_absolute(h_hist - h_max)) / (HEIGHT * WIDTH)
    DPV_h_min = np_sum(np_absolute(h_hist - h_min)) / (HEIGHT * WIDTH)

    sMean = np_mean(schan) / 255
    DPV_s_max = np_sum(np_absolute(s_hist - s_max)) / (HEIGHT * WIDTH)
    DPV_s_min = np_sum(np_absolute(s_hist - s_min)) / (HEIGHT * WIDTH)

    vMean = np_mean(vchan) / 255
    DPV_v_max = np_sum(np_absolute(v_hist - v_max)) / (HEIGHT * WIDTH)
    DPV_v_min = np_sum(np_absolute(v_hist - v_min)) / (HEIGHT * WIDTH)

    correlation = np_corrcoef(h_hist, s_hist)[0][1]

    #image_feature = np_array((hMean, DPV_h_max, DPV_h_min, sMean, DPV_s_max, DPV_s_min, vMean, DPV_v_max, DPV_v_min))
    image_feature = np_array((hMean, DPV_h_max, DPV_h_min, sMean, DPV_s_max,
                              DPV_s_min, correlation))
    image_feature = from_numpy(image_feature).to(device).float().view(
        1, input_size)

    with no_grad():
        out_predict = model(image_feature)
        _, predicted_result = torch_max(out_predict.data, 1)
        original = Tensor([[1, 33, 66, 99]])

    # Round xx.xx %
    percentage_result = np_round(
        mm(out_predict.view(1, num_classes), original.view(num_classes,
                                                           1)).item(), 2)

    # Processed time
    processedTime = np_round(time() - start_time, 2)
    #print("Time  ",processedTime)

    return percentage_result, processedTime
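The DPV_* features are sums of absolute differences between an image histogram and a stored reference histogram, normalised by the pixel count. A toy sketch of that computation (HEIGHT, WIDTH and the histograms below are placeholders, not values from the source):

# Toy sketch of the DPV (histogram dissimilarity) feature.
import numpy as np

HEIGHT, WIDTH = 3000, 4500        # assumed image size used for normalisation
h_hist = np.random.rand(256)      # H-channel histogram of the current image
h_max_ref = np.random.rand(256)   # stored reference histogram ("max" image)

DPV_h_max = np.sum(np.absolute(h_hist - h_max_ref)) / (HEIGHT * WIDTH)
print(DPV_h_max)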
Example #5
    def compute(self):
        """Estimate the SNR in dB from the sorted samples: mean of the 25 largest
        values over the mean of the low-order values."""
        self.s.sort(axis=2)

        a_sig = np_mean(self.s[:,:,-25:], axis=2)
        a_noise = np_mean(self.s[:,:,1:52], axis=2)

        self.snr = 20*np_log10(a_sig/a_noise)
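The last line is the standard decibel conversion of a signal-to-noise amplitude ratio. A standalone version of that step:

# Standalone sketch of the dB conversion used above.
import numpy as np

a_sig = np.array([10.0, 20.0])
a_noise = np.array([1.0, 2.0])
snr_db = 20 * np.log10(a_sig / a_noise)   # 20*log10(ratio) for amplitude ratios
print(snr_db)                             # [20. 20.]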
    def set_plots(self, data_dict):

        #        self.controller = ThreeDController()
        #        self.controller.show()
        self.data_dict = data_dict
        utils.iface.mapCanvas().saveAsImage(
            os.path.join(self.prefs.CHACHE_BASE_DIR, 'canvas.png'))
        self.canvas = np.asarray(
            im.open(os.path.join(self.prefs.CHACHE_BASE_DIR, 'canvas.png')))

        self.set_k(data_dict)

        xMax = utils.iface.mapCanvas().extent().xMaximum()
        xMin = utils.iface.mapCanvas().extent().xMinimum()
        yMax = utils.iface.mapCanvas().extent().yMaximum()
        yMin = utils.iface.mapCanvas().extent().yMinimum()

        ref_gl_obj = self.plot.add_reference_surface(xMin, xMax, yMin, yMax,
                                                     self.canvas)
        self.reference_surf = ThreeDDataSurf(ref_gl_obj, self.plot, 0)

        self.xoff, self.yoff = self.get_xy_offs(data_dict)

        for orbit in data_dict.keys():
            self.orbit_surf_dict[orbit] = {}
            self.orbit_surf_dict[orbit]['data'] = []
            for band in data_dict[orbit].data:
                data = np_mean(
                    band[:, data_dict[orbit].get_range(
                    )[0]:data_dict[orbit].get_range()[1] + 1, :], 0)
                y = np.array(data_dict[orbit].get_proj_y_list())
                x = np.array(data_dict[orbit].get_proj_x_list())
                z = np.linspace(0, data_dict[orbit].get_v_scale(),
                                data.shape[1])
                gl_obj = self.plot.add_surface(x, y, z / 10., data)
                self.orbit_surf_dict[orbit]['data'].append(
                    ThreeDDataSurf(gl_obj, self.plot, 0))


#            if data_dict[orbit].has_key('sim'):
            if data_dict[orbit].sim:
                self.orbit_surf_dict[orbit]['sim'] = []
                for band in data_dict[orbit].sim:
                    data = np_mean(
                        band[:, data_dict[orbit].get_range(
                        )[0]:data_dict[orbit].get_range()[1] + 1, :], 0)
                    y = np.array(data_dict[orbit].get_proj_y_list())
                    x = np.array(data_dict[orbit].get_proj_x_list())
                    z = np.linspace(0, data_dict[orbit].get_v_scale(),
                                    data.shape[1])
                    gl_obj = self.plot.add_surface(x, y, z / 10., data)
                    self.orbit_surf_dict[orbit]['sim'].append(
                        ThreeDDataSurf(gl_obj, self.plot, 0))
    def _write_rep_info(self, 
                        clusters, 
                        cluster_sp_names, 
                        quality_metadata, 
                        genome_quality,
                        excluded_from_refseq_note,
                        ani_af,
                        output_file):
        """Write out information about selected representative genomes."""
                                            
        fout = open(output_file, 'w')
        fout.write('Species\tType genome\tNCBI assembly level\tNCBI genome category')
        fout.write('\tGenome size (bp)\tQuality score\tCompleteness (%)\tContamination (%)\tNo. scaffolds\tNo. contigs\tN50 contigs\tAmbiguous bases\tSSU count\tSSU length (bp)')
        fout.write('\tNo. genomes in cluster\tMean ANI\tMean AF\tMin ANI\tMin AF\tNCBI exclude from RefSeq\n')
        
        for gid in clusters:
            fout.write('%s\t%s\t%s\t%s' % (
                        cluster_sp_names[gid], 
                        gid, 
                        quality_metadata[gid].ncbi_assembly_level,
                        quality_metadata[gid].ncbi_genome_category))

            fout.write('\t%d\t%.2f\t%.2f\t%.2f\t%d\t%d\t%.1f\t%d\t%d\t%d' % (
                            quality_metadata[gid].genome_size,
                            genome_quality[gid], 
                            quality_metadata[gid].checkm_completeness,
                            quality_metadata[gid].checkm_contamination,
                            quality_metadata[gid].scaffold_count,
                            quality_metadata[gid].contig_count,
                            quality_metadata[gid].n50_contigs,
                            quality_metadata[gid].ambiguous_bases,
                            quality_metadata[gid].ssu_count,
                            quality_metadata[gid].ssu_length if quality_metadata[gid].ssu_length else 0))
                            
            anis = []
            afs = []
            for cluster_id in clusters[gid]:
                ani, af = symmetric_ani(ani_af, gid, cluster_id)
                anis.append(ani)
                afs.append(af)
            
            if anis:
                fout.write('\t%d\t%.1f\t%.2f\t%.1f\t%.2f\t%s\n' % (len(clusters[gid]),
                                                                    np_mean(anis), np_mean(afs),
                                                                    min(anis), min(afs),
                                                                    excluded_from_refseq_note.get(gid, '')))
            else:
                fout.write('\t%d\t%s\t%s\t%s\t%s\t%s\n' % (len(clusters[gid]),
                                                            'n/a', 'n/a', 'n/a', 'n/a',
                                                            excluded_from_refseq_note.get(gid, '')))
        fout.close()
    def _nonrep_radius(self, unclustered_gids, rep_gids, ani_af_rep_vs_nonrep):
        """Calculate circumscription radius for unclustered, nontype genomes."""
        
        # set radius for genomes to default values
        nonrep_radius = {}
        for gid in unclustered_gids:
            nonrep_radius[gid] = GenomeRadius(ani = self.ani_sp, 
                                                     af = None,
                                                     neighbour_gid = None)

        # determine closest type ANI neighbour and restrict ANI radius as necessary
        ani_af = pickle.load(open(ani_af_rep_vs_nonrep, 'rb'))
        for nonrep_gid in unclustered_gids:
            if nonrep_gid not in ani_af:
                continue
                    
            for rep_gid in rep_gids:
                if rep_gid not in ani_af[nonrep_gid]:
                    continue
                    
                ani, af = symmetric_ani(ani_af, nonrep_gid, rep_gid)

                if ani > nonrep_radius[nonrep_gid].ani and af >= self.af_sp:
                    nonrep_radius[nonrep_gid] = GenomeRadius(ani = ani, 
                                                             af = af,
                                                             neighbour_gid = rep_gid)
                    
        self.logger.info('ANI circumscription radius: min={:.2f}, mean={:.2f}, max={:.2f}'.format(
                                min([d.ani for d in nonrep_radius.values()]), 
                                np_mean([d.ani for d in nonrep_radius.values()]), 
                                max([d.ani for d in nonrep_radius.values()])))
                        
        return nonrep_radius
Example #10
def entropy_batch_mixing_(latents,
                          batches,
                          n_neighs=50,
                          n_pools=50,
                          n_samples=100):
    def cross_entropy(data):
        n_batches = len(unique(data))
        assert n_batches == 2, ValueError(
            "Entropy can be calculated with only 2 batches")

        freq = np_mean(data == unique(data)[0])
        if freq == 0 or freq == 1:
            return 0

        return -freq * log(freq) - (1 - freq) * log(1 - freq)

    n_neighs = min(n_neighs, latents.shape[0] - 1)
    knn = NearestNeighbors(n_neighbors=n_neighs + 1, n_jobs=8)
    knn.fit(latents)
    kmatrix = knn.kneighbors_graph(latents) - scipy.sparse.identity(
        latents.shape[0])

    score = 0
    #pdb.set_trace()
    for t in range(n_pools):
        indices = choice(arange(latents.shape[0]), size=n_samples)
        while unique(batches[kmatrix[indices].nonzero()[1][
                kmatrix[indices].nonzero()[0] == 1]]).shape[0] < 2:
            indices = choice(arange(latents.shape[0]), size=n_samples)
        score += np_mean([
            cross_entropy(batches[kmatrix[indices].nonzero()[1][
                kmatrix[indices].nonzero()[0] == i]]) for i in range(n_samples)
        ])

    return score / n_pools
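cross_entropy above is the binary entropy of the batch labels among a cell's neighbours. A standalone version of the formula:

# Standalone sketch of the binary (batch) entropy used inside entropy_batch_mixing_.
from numpy import array, log, mean as np_mean, unique

def binary_entropy(labels):
    freq = np_mean(labels == unique(labels)[0])
    if freq == 0 or freq == 1:
        return 0.0
    return -freq * log(freq) - (1 - freq) * log(1 - freq)

print(binary_entropy(array([0, 0, 1, 1])))  # log(2) ~ 0.693, i.e. maximal mixing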
Example #11
    def set_data_vol(self,rates_mat,rates_mat_dv_indx=1):
        """
        Calculates the total data volume that can be sent over this link. Uses average data rate to determine data volume. Depending on how much the input data rates matrix is decimated, this could lead to over or underestimates of data volume.

        :return:
        """

        # Note: float[num_timepoints][2] rates_mat: matrix of datarates at each time during the pass. First column is time in MJD, and second column is data rate from sat to xsat in Mbps, third is rate from xsat to sat.

        start_mjd = tt.datetime2mjd(self.start)-5/86400.0  # add 5 secs of padding to avoid any precision problems
        end_mjd = tt.datetime2mjd(self.end)+5/86400.0  # add 5 secs of padding to avoid any precision problems

        #  this is fixed in the structure of the data rates output file
        rates_mat_tp_indx = 0

        data_rates = []
        for i in range(len(rates_mat)):
            # if point i is within window -  this should take care of any indexing issues
            if rates_mat[i][rates_mat_tp_indx] >= start_mjd and rates_mat[i][rates_mat_tp_indx] <= end_mjd:
                data_rates.append(rates_mat[i][rates_mat_dv_indx])

        try:
            #  take the average of all the data rates we saw and multiply by the duration of the window to get data volume
            self.data_vol = np_mean(data_rates) * (self.end - self.start).total_seconds()
            if self.original_data_vol is None:
                self.original_data_vol = self.data_vol
        except RuntimeWarning as e:
            raise RuntimeWarning('Trouble determining average data rate. Probably no time points were found within the start and end of the window. Ensure that you are not overly decimating data rate calculations in the data rates input file (window: %s, exception seen: %s)'%(self,str(e)))
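The data-volume estimate is simply the mean rate inside the window multiplied by the window duration. A rough worked example, with units assumed from the comment above (Mbps x seconds = megabits):

# Worked example of the data-volume estimate (assumed units: Mbps and seconds).
from numpy import mean as np_mean

data_rates_mbps = [95.0, 100.0, 105.0]    # sampled rates inside the window
duration_s = 600.0                        # a 10-minute window
data_vol_mbit = np_mean(data_rates_mbps) * duration_s
print(data_vol_mbit)                      # 60000.0 megabits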
Example #12
def statsliste(data, weights=None, masks=None):
    """Return the per-element (weighted) total weight, means, and standard deviations of the data

    Input:
        data: a numpy.array
        weights: an array of weights for the instances [optional]
        masks: an array of masks for the instances [optional]
    Output:
         n: total weight
         mean: mean tensor
         thestd: standard deviation tensor
    """

    if masks is not None:
        data = data[masks]
        if weights is not None:
            weights = weights[masks]

    n = moment0(data, weights)
    if weights is None:
        mu = np_mean(data, 0)
        thestd = std(data, 0)
    else:
        mu = mean(n, moment1(data, weights))
        thestd = stde(n, moment2e(data, weights), mu)
    return (n, mu, thestd)
Example #13
def statslist(data, weights=None, masks=None):
    """Return the (weighted) total weight, mean, and covariance of the data

    Input:
        data: a numpy.array
        weights: an array of weights for the instances [optional]
        masks: an array of masks for the instances [optional]
    Output:
         n: total weight
         mean: mean tensor
         thecov: covariance tensor
    """

    if masks is not None:
        data = data[masks]
        if weights is not None:
            weights = weights[masks]

    n = moment0(data, weights)
    if weights is None:
        mu = np_mean(data, 0)
        theshape = data[0].shape
        thesize = data[0].size
        thecov = cov(data.reshape(n, thesize), None, 0).reshape(theshape * 2)
    else:
        mu = mean(n, moment1(data, weights))
        thecov = covariance(n, moment2(data, weights), mu)
    return (n, mu, thecov)
Example #14
 def aggregateResources(self, nbins=20):
     """ returns a json object which contains max, min, mean, median, 
         and the histogram itself for all memories/cpu 
         WARNING: this method is not particularly efficient 
         and shouldn't be used lightly!
     """
     allData = {"memory": {"data": []}, "cpu": {"data": []}}
     query = JobInstance.objects.filter(job=self).only("cpu").only("memory")
     if query.count():
         for inst in query:
             agg = inst.aggregateResources()
             for key in ['cpu', 'memory']:
                 if len(agg[key]):
                     allData[key]['data'].append(max(agg[key]))
         del query
         # finished aggregation, now we can do calculations
         for key in allData:
             d = allData[key]["data"]
             allData[key]["max"] = max(d)
             allData[key]["min"] = min(d)
             arr = np_array(d, dtype=float)
             allData[key]["mean"] = float(np_mean(arr, axis=0))
             allData[key]["median"] = float(np_median(arr, axis=0))
             hist, bins = np_hist(arr, nbins)
             center = (bins[:-1] + bins[1:]) / 2
             w = (bins[1] - bins[0])
             histo = np_array([center, hist])
             allData[key]['histogram'] = {
                 "histo": histo.tolist(),
                 "histoT": histo.T.tolist(),
                 "binWidth": float(w)
             }
             del allData[key]['data']
     return dumps(allData)
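np_hist above presumably aliases numpy.histogram. A minimal sketch of that assumption together with the bin-centre step used for the histogram payload:

# Assumed alias and the bin-centre computation (a sketch, not confirmed by the source).
from numpy import array as np_array, histogram as np_hist

arr = np_array([1.0, 2.0, 2.5, 3.0, 7.0])
hist, bins = np_hist(arr, 4)          # 4 bins -> 5 bin edges
center = (bins[:-1] + bins[1:]) / 2   # midpoint of each bin
width = bins[1] - bins[0]
print(hist, center, width)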
def transfer_same_dist(test_list, train_list, com_comp, test_rem):
    if len(test_rem) == 0:
        return test_list, train_list, com_comp

    sizes = [len(line) for line in test_rem]
    mean_test_size = np_mean(sizes)
    sd = sqrt(np_var(sizes))
    if sd != 0:
        test_rem_dist = norm_dist(mean_test_size, sd)
        p_dist = [test_rem_dist.pdf(len(line)) for line in train_list]
        norm_ct = sum(p_dist)
        if norm_ct != 0:
            p_dist = [val / norm_ct for val in p_dist]
        # materialise as a plain list so it can be concatenated with test_list below
        train_rem = list(rand_choice(train_list,
                                     size=com_comp,
                                     replace=False,
                                     p=p_dist))
    else:
        train_rem = [
            line for line in train_list if len(line) == mean_test_size
        ][:com_comp]
    test_list = test_list + train_rem
    for line in train_rem:
        train_list.remove(line)
    return test_list, train_list, com_comp
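norm_dist and rand_choice above look like aliased SciPy/NumPy imports. A minimal sketch of the aliases this snippet appears to assume (inferred from the call names):

# Assumed import aliases for transfer_same_dist (inferred, not from the source).
from math import sqrt
from numpy import mean as np_mean, var as np_var
from numpy.random import choice as rand_choice
from scipy.stats import norm as norm_dist   # norm_dist(mean, sd).pdf(x) gives the normal density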
    def _nontype_radius(self, unclustered_gids, type_gids, ani_af_nontype_vs_type):
        """Calculate circumscription radius for unclustered, nontype genomes."""
        
        # set type radius for all type genomes to default values
        nontype_radius = {}
        for gid in unclustered_gids:
            nontype_radius[gid] = GenomeRadius(ani = self.ani_sp, 
                                                     af = None,
                                                     neighbour_gid = None)

        # determine closest type ANI neighbour and restrict ANI radius as necessary
        ani_af = pickle.load(open(ani_af_nontype_vs_type, 'rb'))
        for nontype_gid in unclustered_gids:
            if nontype_gid not in ani_af:
                continue
                    
            for type_gid in type_gids:
                if type_gid not in ani_af[nontype_gid]:
                    continue
                    
                ani, af = symmetric_ani(ani_af, nontype_gid, type_gid)

                if ani > nontype_radius[nontype_gid].ani and af >= self.af_sp:
                    nontype_radius[nontype_gid] = GenomeRadius(ani = ani, 
                                                                 af = af,
                                                                 neighbour_gid = type_gid)
                    
        self.logger.info('ANI circumscription radius: min=%.2f, mean=%.2f, max=%.2f' % (
                                min([d.ani for d in nontype_radius.values()]), 
                                np_mean([d.ani for d in nontype_radius.values()]), 
                                max([d.ani for d in nontype_radius.values()])))
                        
        return nontype_radius
Example #17
def within_cluster_similarity_statistics(cluster):
    """ Calculate the sequence similarities within a cluster.

    Return the similarity matrix.
    """
    representations = cluster.seqs
    _representations = cluster.seqs_as_list()
    lenrep = len(_representations)

    similarities = np.ones((lenrep, lenrep, 3))
    for j in range(lenrep):
        for k in range(j + 1, lenrep):
            # calculate once
            sim = diff_sequences(_representations[j], _representations[k])
            # but fill both triangles of the matrix
            similarities[j, k, :] = [
                representations[j].id, representations[k].id, sim
            ]
            similarities[k, j, :] = [
                representations[k].id, representations[j].id, sim
            ]

    average_rep_sim = np_mean(similarities[:, :, 2])
    var_rep_sim = np_var(similarities[:, :, 2])

    return similarities, average_rep_sim, var_rep_sim
Example #18
def get_knn_purity(latents, labels, n_neighs=30):
    latents = latents.cpu().detach().numpy() if isinstance(latents,
                                                           Tensor) else latents
    labels = labels.cpu().detach().numpy() if isinstance(labels,
                                                         Tensor) else labels

    nbrs = NearestNeighbors(n_neighbors=n_neighs + 1).fit(latents)
    indices = nbrs.kneighbors(latents, return_distance=False)[:, 1:]
    neigh_labels = vectorize(lambda x: labels[x])(indices)

    scores = ((neigh_labels - labels.reshape(-1, 1)) == 0).mean(axis=1)
    res = [
        np_mean(scores[labels.reshape(scores.shape) == i])
        for i in unique(labels)
    ]

    return np_mean(res)
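A sketch of the imports get_knn_purity appears to rely on (inferred from the call names; Tensor is presumably torch.Tensor):

# Assumed imports for get_knn_purity (inferred, not from the source).
from numpy import mean as np_mean, unique, vectorize
from sklearn.neighbors import NearestNeighbors
from torch import Tensor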
Example #19
 def transformCP(self, timer, silent=False, nolog=False):
     """Do the main transformation on the coverage profile data"""
     if not silent:
         print("    Reticulating splines")
     self.transformedCP = self.dataManager.getTransformedCoverageProfiles(self.dbFileName, indices=self.indices)
     self.corners = self.dataManager.getTransformedCoverageCorners(self.dbFileName)
     self.TCentre = np_mean(self.corners, axis=0)
     self.transRadius = np_norm(self.corners[0] - self.TCentre)
        def process_feature(list_path, labelFeature):
            list_dir = sorted(listdir(list_path))
            if list_dir == []:
                return -1
            for image_path in list_dir:
                name_image = os_path.join(list_path, image_path)
                if name_image == imgSample1 or name_image == imgSample2:
                    continue
                img = imread(name_image)
                img = resize(img, (6000, 4000))
                img = img[500:-500, 750:-750, :]
                img = cvtColor(img, COLOR_BGR2HSV)
                hchan, schan, vchan = split(img)
                h_hist = calcHist([img], [0], None, [256],
                                  [0, 256]).reshape(256, )
                s_hist = calcHist([img], [1], None, [256],
                                  [0, 256]).reshape(256, )
                v_hist = calcHist([img], [2], None, [256],
                                  [0, 256]).reshape(256, )

                hMean = np_mean(hchan) / 255
                DPV_h_max = np_sum(
                    np_absolute(h_hist - h_max)) / (HEIGHT * WIDTH)
                DPV_h_min = np_sum(
                    np_absolute(h_hist - h_min)) / (HEIGHT * WIDTH)

                sMean = np_mean(schan) / 255
                DPV_s_max = np_sum(
                    np_absolute(s_hist - s_max)) / (HEIGHT * WIDTH)
                DPV_s_min = np_sum(
                    np_absolute(s_hist - s_min)) / (HEIGHT * WIDTH)

                vMean = np_mean(vchan) / 255
                DPV_v_max = np_sum(
                    np_absolute(v_hist - v_max)) / (HEIGHT * WIDTH)
                DPV_v_min = np_sum(
                    np_absolute(v_hist - v_min)) / (HEIGHT * WIDTH)

                correlation = np_corrcoef(h_hist, s_hist)[0][1]
                # variable = [hMean, DPV_h_max, DPV_h_min, sMean, DPV_s_max, DPV_s_min, vMean, DPV_v_max, DPV_v_min]
                variable = [
                    hMean, DPV_h_max, DPV_h_min, sMean, DPV_s_max, DPV_s_min,
                    correlation
                ]
                feature.append(variable)
                labels.append([labelFeature])
Example #21
    def write_rank_count(self, ranks_below_taxon, results_table):
        """Write table indicating number of ranks below each taxa.

        Parameters
        ----------
        ranks_below_taxon : d[taxon][rank prefix] -> count, or list of counts
            Number of ranks below named taxon.
        results_table : str
            Desired output file.
        """
        
        # determine if count is a scalar or vectors
        taxon = list(ranks_below_taxon.keys())[0]
        rank_prefix = list(ranks_below_taxon[taxon].keys())[0]
        count = ranks_below_taxon[taxon][rank_prefix]
        
        count_is_scalar = True
        if isinstance(count, (list, tuple)):
            count_is_scalar = False
        
        # write out results sorted by taxonomic rank        
        sorted_taxon = []
        for rank_prefix in (['root'] + list(Taxonomy.rank_prefixes) + ['RS_', 'GB_', 'U_']):
            taxa_at_rank = []
            for taxon in ranks_below_taxon:
                if taxon.startswith(rank_prefix):
                    taxa_at_rank.append(taxon)
                    
            sorted_taxon += sorted(taxa_at_rank)
            
        fout = open(results_table, 'w')
        fout.write('Taxon')
        for rank_prefix in Taxonomy.rank_prefixes:
            if count_is_scalar:
                fout.write('\t%s' % rank_prefix.capitalize())
            else:
                fout.write('\t%s\t%s\t%s\t%s' % ('Mean: ' + rank_prefix.capitalize(), 
                                                    'Std: ' + rank_prefix.capitalize(),
                                                    'Min: ' + rank_prefix.capitalize(),
                                                    'Max: ' + rank_prefix.capitalize()))
        fout.write('\n')
            
        for taxon in sorted_taxon:
            fout.write(taxon)
            
            for rank_prefix in Taxonomy.rank_prefixes:
                count = ranks_below_taxon[taxon][rank_prefix.capitalize()]
                if count_is_scalar:
                    fout.write('\t%d' % count)
                else:
                    if len(count) > 0:
                        fout.write('\t%.1f\t%.2f\t%d\t%d' % (np_mean(count), np_std(count), min(count), max(count)))
                    else:
                        fout.write('\t%d\t%d\t%d\t%d' % (0, 0, 0, 0))
                    
            fout.write('\n')
                
        fout.close()
Example #22
    def write_rank_count(self, ranks_below_taxon, results_table):
        """Write table indicating number of ranks below each taxa.

        Parameters
        ----------
        ranks_below_taxon : d[taxon][rank prefix] -> count, or list of counts
            Number of ranks below named taxon.
        results_table : str
            Desired output file.
        """
        
        # determine if count is a scalar or vectors
        taxon = ranks_below_taxon.keys()[0]
        rank_prefix = ranks_below_taxon[taxon].keys()[0]
        count = ranks_below_taxon[taxon][rank_prefix]
        
        count_is_scalar = True
        if isinstance(count, (list, tuple)):
            count_is_scalar = False
        
        # write out results sorted by taxonomic rank        
        sorted_taxon = []
        for rank_prefix in (['root'] + list(Taxonomy.rank_prefixes) + ['RS_', 'GB_', 'U_']):
            taxa_at_rank = []
            for taxon in ranks_below_taxon:
                if taxon.startswith(rank_prefix):
                    taxa_at_rank.append(taxon)
                    
            sorted_taxon += sorted(taxa_at_rank)
            
        fout = open(results_table, 'w')
        fout.write('Taxon')
        for rank_prefix in Taxonomy.rank_prefixes:
            if count_is_scalar:
                fout.write('\t%s' % rank_prefix.capitalize())
            else:
                fout.write('\t%s\t%s\t%s\t%s' % ('Mean: ' + rank_prefix.capitalize(), 
                                                    'Std: ' + rank_prefix.capitalize(),
                                                    'Min: ' + rank_prefix.capitalize(),
                                                    'Max: ' + rank_prefix.capitalize()))
        fout.write('\n')
            
        for taxon in sorted_taxon:
            fout.write(taxon)
            
            for rank_prefix in Taxonomy.rank_prefixes:
                count = ranks_below_taxon[taxon][rank_prefix.capitalize()]
                if count_is_scalar:
                    fout.write('\t%d' % count)
                else:
                    if len(count) > 0:
                        fout.write('\t%.1f\t%.2f\t%d\t%d' % (np_mean(count), np_std(count), min(count), max(count)))
                    else:
                        fout.write('\t%d\t%d\t%d\t%d' % (0, 0, 0, 0))
                    
            fout.write('\n')
                
        fout.close()
Example #23
def test(data=None,
         precision_bp=2000,
         nb_bp=3,
         taille_fenetre=10,
         breakp=None,
         abscisse=None):
    """Paramètres"""
    #donnees
    if data == None:
        data = [
            580.38, 581.86, 580.97, 580.8, 579.79, 580.39, 580.42, 580.82,
            581.4, 581.32, 581.44, 581.68, 581.17, 580.53, 580.01, 579.91,
            579.14, 579.16, 579.55, 579.67, 578.44, 578.24, 579.1, 579.09,
            579.35, 578.82, 579.32, 579.01, 579, 579.8, 579.83, 579.72, 579.89,
            580.01, 579.37, 578.69, 578.19, 578.67, 579.55, 578.92, 578.09,
            579.37, 580.13, 580.14, 579.51, 579.24, 578.66, 578.86, 578.05,
            577.79, 576.75, 576.75, 577.82, 578.64, 580.58, 579.48, 577.38,
            576.9, 576.94, 576.24, 576.84, 576.85, 576.9, 577.79, 578.18,
            577.51, 577.23, 578.42, 579.61, 579.05, 579.26, 579.22, 579.38,
            579.1, 577.95, 578.12, 579.75, 580.85, 580.41, 579.96, 579.61,
            578.76, 578.18, 577.21, 577.13, 579.1, 578.25, 577.91, 576.89,
            575.96, 576.8, 577.68, 578.38, 578.52, 579.74, 579.31, 579.89,
            579.96, 579.96, 579.96
        ]
    # segmentation value used to find the breakpoints
    # number of breakpoints > 0

    # Display the variance and mean of the data
    print("variance = ", np_var(data))
    print("ecart type = ", np_var(data)**0.5)
    print("moyenne = ", np_mean(data))

    # Compute the integral of the fitted Gaussian
    mu = np_mean(data)
    sig = np_var(data)
    ecart = (max(data) - min(data))
    integral_g = quad(gaussian,
                      min(data) - ecart,
                      max(data) + ecart,
                      args=(mu, sig))
    print("integrale gauss", integral_g)
    print(mu, sig, ecart)

    # Call the SAX function
    vector_c, vector_c_fit = sax(data, taille_fenetre)
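quad and gaussian are not defined in the snippet. A minimal sketch of what they presumably look like: a SciPy quadrature call over a Gaussian density, with sig treated as the variance to match mu = mean(data), sig = var(data) above (hypothetical helper, not from the source):

# Hypothetical gaussian() helper and quadrature call assumed by test().
from math import exp, pi, sqrt
from scipy.integrate import quad

def gaussian(x, mu, sig):
    # Normal density with mean mu and variance sig.
    return exp(-((x - mu) ** 2) / (2.0 * sig)) / sqrt(2.0 * pi * sig)

area, _ = quad(gaussian, -10.0, 10.0, args=(0.0, 1.0))
print(area)  # ~1.0 for a proper density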
Example #24
    def __writer(self, num_species, output_dir, writer_queue):
        """Write results for each species."""
        
        # gather results for each genome
        output_file = os.path.join(output_dir, 'ani_species.tsv')
        fout = open(output_file, 'w')
        fout.write('Species\tNo. Sampled Genomes\tMean ANI\tMedian ANI\t5th Percentile\t95th Percentile')
        fout.write('\tMean AF\tMedian AF\t5th Percentile\t95th Percentile')
        fout.write('\tSampled Genomes\n')
        
        output_file = os.path.join(output_dir, 'ani.tsv')
        fout_pw = open(output_file, 'w')
        fout_pw.write('Species\tGenome 1\tGenome 2\tANI(1->2)\tANI(2->1)\tAF(1->2)\tAF(2->1)\n')
        processed = 0
        while True:
            species, ani, af, genome_ids, results = writer_queue.get(block=True, timeout=None)
            if species is None:
                break

            processed += 1
            statusStr = 'Finished processing %d of %d (%.2f%%) species.' % (processed,
                                                                            num_species,
                                                                            float(processed) * 100 / num_species)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            fout_pw.write(results)
            
            row = '%s\t%d' % (species, len(genome_ids))
            mean_ani = np_mean(ani)
            p5, median, p95 = np_percentile(ani, [5, 50, 95])
            row += '\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_ani,
                                                    median,
                                                    p5, p95)
            mean_af = np_mean(af)
            p5, median, p95 = np_percentile(af, [5, 50, 95])
            row += '\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_af*100,
                                                    median*100,
                                                    p5*100, p95*100)
            fout.write('%s\t%s\n' % (row, ','.join(genome_ids)))

        sys.stdout.write('\n')

        fout.close()
        fout_pw.close()
Example #25
File: node.py Project: luk-f/SAX
    def computeMeanStDev(self, list_ts_paa):

        mean = full(len(list_ts_paa[0]), inf)
        stdev = full(len(list_ts_paa[0]), inf)
        for i in range(len(list_ts_paa[0])):
            seg_i = [ts[i] for ts in list_ts_paa]
            mean[i] = np_mean(seg_i)
            stdev[i] = std(seg_i)
        return mean, stdev
Example #26
 def mean_len_by_words(self)->float:
     """
     Mean word length (characters per word) of the input string.
     Example: for 'Sin Documentación del indicador' the word lengths are
     3, 13, 3 and 9, so mean_len_by_words() returns 7.0.
     """
     str_inp_=self.str_inp
     return round(np_mean(list(map(len,str_inp_.split(" ")))),2)
Example #27
 def process_feature(list_path, labelFeature):
     print("Extracting...")
     list_dir = sorted(listdir(list_path))
     if list_dir == []:
         return -1
     for image_path in list_dir:
         name_image = os_path.join(list_path, image_path)
         if name_image == imgSample1 or name_image == imgSample2:
             continue
         img = imread(name_image)
         img = resize(img, (6000,4000))
         img = img[500:-500, 750:-750, :]
         img = cvtColor(img, COLOR_BGR2HSV)
         hchan, schan, vchan = split(img)
         h_hist = calcHist([img], [0], None, [256], [0,256]).reshape(256,)
         s_hist = calcHist([img], [1], None, [256], [0,256]).reshape(256,)
         v_hist = calcHist([img], [2], None, [256], [0,256]).reshape(256,)
         
         # The 7 features consist of:
         # + mean pixel value of the H channel
         # + dissimilarity with the H channel of the "max" image
         # + dissimilarity with the H channel of the "min" image
         # + mean pixel value of the S channel
         # + dissimilarity with the S channel of the "max" image
         # + dissimilarity with the S channel of the "min" image
         # + correlation between the histograms of the H and S channels
         hMean = np_mean(hchan)/255
         DPV_h_max = np_sum(np_absolute(h_hist - h_max))/(HEIGHT*WIDTH)
         DPV_h_min = np_sum(np_absolute(h_hist - h_min))/(HEIGHT*WIDTH)
         
         sMean = np_mean(schan)/255
         DPV_s_max = np_sum(np_absolute(s_hist - s_max))/(HEIGHT*WIDTH)
         DPV_s_min = np_sum(np_absolute(s_hist - s_min))/(HEIGHT*WIDTH)
         
         vMean = np_mean(vchan)/255
         DPV_v_max = np_sum(np_absolute(v_hist - v_max))/(HEIGHT*WIDTH)
         DPV_v_min = np_sum(np_absolute(v_hist - v_min))/(HEIGHT*WIDTH)
         
         correlation = np_corrcoef(h_hist, s_hist)[0][1]
         # variable = [hMean, DPV_h_max, DPV_h_min, sMean, DPV_s_max, DPV_s_min, vMean, DPV_v_max, DPV_v_min]
         variable = [hMean, DPV_h_max, DPV_h_min, sMean, DPV_s_max, DPV_s_min, correlation]
         feature.append(variable)
         labels.append([labelFeature])
Example #28
    def cross_entropy(data):
        n_batches = len(unique(data))
        assert n_batches == 2, ValueError(
            "Entropy can be calculated with only 2 batches")

        freq = np_mean(data == unique(data)[0])
        if freq == 0 or freq == 1:
            return 0

        return -freq * log(freq) - (1 - freq) * log(1 - freq)
Example #29
    def subsample_msa(self, seqs, markers):
        # type: (dict, list) -> (list, dict)
        """Sample columns from each marker in multiple sequence alignment."""

        alignment_length = len(list(seqs.values())[0])
        sampled_cols = []
        start = 0
        lack_sufficient_cols = 0
        lack_cols_marker_ids = []
        avg_perc_cols = []
        for marker_id, marker_name, marker_len in markers:
            end = start + marker_len

            valid_cols = self.identify_valid_columns(start, end, seqs)
            assert (len(valid_cols) <= marker_len)  # sanity check

            self.logger.info(
                '%s: S:%d, E:%d, LEN:%d, COLS:%d, PERC:%.1f' %
                (marker_name, start, end, marker_len, len(valid_cols),
                 len(valid_cols) * 100.0 / marker_len))

            avg_perc_cols.append(len(valid_cols) * 100.0 / marker_len)

            if len(valid_cols) < self.subset:
                self.logger.warning('Marker has <%d columns after filtering.' %
                                    self.subset)
                lack_sufficient_cols += 1
                lack_cols_marker_ids.append(marker_id)

            offset_valid_cols = [i + start for i in valid_cols]
            sel_cols = random.sample(offset_valid_cols,
                                     min(self.subset, len(offset_valid_cols)))
            sampled_cols.extend(sel_cols)

            start = end

        mask = [1 if i in sampled_cols else 0 for i in range(alignment_length)]

        self.logger.info(
            'Identified %d of %d marker genes with <%d columns for sampling:' %
            (lack_sufficient_cols, len(markers), self.subset))
        self.logger.info('%s' % ', '.join(lack_cols_marker_ids))
        self.logger.info(
            'Marker genes had %.1f+/-%.1f%% of columns available for selection on average.'
            % (np_mean(avg_perc_cols), np_std(avg_perc_cols)))
        self.logger.info('Final MSA contains %d columns.' % len(sampled_cols))

        # trim columns
        output_seqs = {}
        for seq_id, seq in seqs.items():
            masked_seq = ''.join(
                [seq[i] for i in range(0, len(mask)) if mask[i]])
            output_seqs[seq_id] = masked_seq

        return mask, output_seqs
Example #30
def write_clusters(clusters, species, out_file):
    """Write out clustering information."""

    fout = open(out_file, 'w')
    fout.write(
        'NCBI species\tType genome\tNo. clustered genomes\tMean ANI\tMin ANI\tMean AF\tMin AF\tClustered genomes\n'
    )
    for gid in sorted(clusters, key=lambda x: len(clusters[x]), reverse=True):
        if len(clusters[gid]):
            mean_ani = '%.2f' % np_mean([d.ani for d in clusters[gid]])
            min_ani = '%.2f' % min([d.ani for d in clusters[gid]])
            mean_af = '%.2f' % np_mean([d.af for d in clusters[gid]])
            min_af = '%.2f' % min([d.af for d in clusters[gid]])
        else:
            mean_ani = min_ani = mean_af = min_af = 'N/A'
        fout.write('%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n' %
                   (species.get(gid, 'unclassified'), gid, len(clusters[gid]),
                    mean_ani, min_ani, mean_af, min_af, ','.join(
                        [d.gid for d in clusters[gid]])))
    fout.close()
Example #31
    def _gene_distribution(self, seq_file):
        """Calculate length distribution of sequences."""

        gene_lens = []
        for seq_id, seq in seq_io.read_seq(seq_file):
            gene_lens.append(len(seq))

        p10, p50, p90 = np_percentile(gene_lens, [10, 50, 90])

        return np_mean(gene_lens), max(gene_lens), min(
            gene_lens), p10, p50, p90
def split_meth_orig(perm_lines, inputs):
    fact = inputs['fact']  # 0.99
    split_pt = int(round(len(perm_lines) * fact))
    train_list = [line for line in perm_lines[0:split_pt]]
    test_list = [line for line in perm_lines[split_pt:]]
    # Start with something that has a biased size distribution !!

    sizes = [len(line) for line in train_list]
    train_mean = np_mean(sizes)

    # Transferring some of the smaller complexes to the test list
    train_list_lower_mean = [
        line for line in train_list if len(line) < train_mean
    ]
    perc_transfer = inputs[
        'perc_transfer']  # 0.3 # You can optimize these parameters !
    to_transfer = train_list_lower_mean[:int(
        round(len(train_list_lower_mean) * perc_transfer))]
    test_list = test_list + to_transfer

    # Now remove from train set
    for line in to_transfer:
        train_list.remove(line)

    # Finding complexes in train that share an edge with a complex in test
    com_comp = 10
    while com_comp != 0:  # Do until train and test sets are completely separated

        # Removing super huge complexes also (nodes >30 ) from test set
        test_list = [line for line in test_list if len(line) < 30]

        # REMOVE OVERLAP B/W TRAIN AND TEST DATA
        # Remove complexes from train set sharing two proteins with test set
        train_rem = []
        train_rem_append = train_rem.append
        com_comp = 0
        for train_line in train_list:
            pres = 0
            for test_line in test_list:
                common = len(
                    set(train_line.edges()).intersection(set(test_line.edges)))
                if common >= 1:
                    pres = 1
                    break
            if pres == 1:
                train_rem_append(train_line)
                com_comp += 1

        logging_info("No. of train complexes transferred = %s", str(com_comp))
        test_list = test_list + train_rem
        for t_line in train_rem:
            train_list.remove(t_line)
    return train_list, test_list
Example #34
    def _type_genome_radius(self, type_gids, type_genome_ani_file):
        """Calculate circumscription radius for type genomes."""

        # set type radius for all type genomes to default values
        type_radius = {}
        for gid in type_gids:
            type_radius[gid] = GenomeRadius(ani=self.ani_sp,
                                            af=None,
                                            neighbour_gid=None)

        # determine closest ANI neighbour and restrict ANI radius as necessary
        with open(type_genome_ani_file) as f:
            header = f.readline().strip().split('\t')

            type_gid1_index = header.index('Type genome 1')
            type_gid2_index = header.index('Type genome 2')
            ani_index = header.index('ANI')
            af_index = header.index('AF')

            for line in f:
                line_split = line.strip().split('\t')

                type_gid1 = line_split[type_gid1_index]
                type_gid2 = line_split[type_gid2_index]

                if type_gid1 not in type_gids or type_gid2 not in type_gids:
                    continue

                ani = float(line_split[ani_index])
                af = float(line_split[af_index])

                if ani > type_radius[type_gid1].ani:
                    if af < self.af_sp:
                        if ani >= self.ani_sp:
                            self.logger.warning(
                                'ANI for %s and %s is >%.2f, but AF <%.2f [pair skipped].'
                                % (type_gid1, type_gid2, ani, af))
                        continue

                    if ani > self.max_ani_neighbour:
                        self.logger.error('ANI neighbour %s is >%.2f for %s.' %
                                          (type_gid2, ani, type_gid1))

                    type_radius[type_gid1] = GenomeRadius(
                        ani=ani, af=af, neighbour_gid=type_gid2)

        self.logger.info(
            'ANI circumscription radius: min=%.2f, mean=%.2f, max=%.2f' %
            (min([d.ani for d in type_radius.values()
                  ]), np_mean([d.ani for d in type_radius.values()
                               ]), max([d.ani for d in type_radius.values()])))

        return type_radius
def write_clusters(clusters, type_radius, species, out_file):
    """Write out clustering information."""

    fout = open(out_file, 'w')
    fout.write('NCBI species\tType genome')
    fout.write('\tClosest species\tClosest type genome\tANI radius\tAF closest')
    fout.write('\tNo. clustered genomes\tMean ANI\tMin ANI\tMean AF\tMin AF\tClustered genomes\n')
    for gid in sorted(clusters, key=lambda x: len(clusters[x]), reverse=True):
        if len(clusters[gid]):
            mean_ani = '%.2f' % np_mean([d.ani for d in clusters[gid]])
            min_ani = '%.2f' % min([d.ani for d in clusters[gid]])
            mean_af = '%.2f' % np_mean([d.af for d in clusters[gid]])
            min_af = '%.2f' % min([d.af for d in clusters[gid]])
        else:
            mean_ani = min_ani = mean_af = min_af = 'N/A'
        fout.write('%s\t%s' % (
                        species.get(gid, 'unclassified'), 
                        gid))
                        
        ani, af, closest_gid = type_radius[gid]
        if not af:
            af = 0
            
        if not closest_gid or closest_gid == 'N/A':
            closest_gid = 'N/A'
            closest_sp = 'N/A'
        else:
            closest_sp = species[closest_gid]
        
        fout.write('\t%s\t%s\t%.2f\t%.2f' % (closest_sp,
                                                closest_gid,
                                                ani,
                                                af))
                        
        fout.write('\t%d\t%s\t%s\t%s\t%s\t%s\n' % (
                        len(clusters[gid]),
                        mean_ani, min_ani,
                        mean_af, min_af,
                        ','.join([d.gid for d in clusters[gid]])))
    fout.close()
Example #37
    def getCentroidStats(self, profile):
        """Calculate the centroids of the profile"""
        working_list = profile[self.rowIndices]

        # return the mean and stdev
        # we divide by std so we need to make sure it's never 0
        tmp_stds = np_std(working_list, axis=0)
        mean_std = np_mean(tmp_stds)
        try:
            std = np_array([x if x != 0 else mean_std for x in tmp_stds])
        except:
            std = mean_std
        return (np_median(working_list, axis=0), std)
Example #38
def write_clusters(clusters, rep_radius, genomes, out_file):
    """Write out clustering information."""

    fout = open(out_file, 'w')
    fout.write('Representative\tGTDB species\tNCBI species')
    fout.write(
        '\tClosest GTDB species\tClosest representative\tANI radius\tAF closest'
    )
    fout.write(
        '\tNo. clustered genomes\tMean ANI\tMin ANI\tMean AF\tMin AF\tClustered genomes\n'
    )
    for gid in sorted(clusters, key=lambda x: len(clusters[x]), reverse=True):
        if clusters[gid]:
            mean_ani = '%.2f' % np_mean([d.ani for d in clusters[gid]])
            min_ani = '%.2f' % min([d.ani for d in clusters[gid]])
            mean_af = '%.2f' % np_mean([d.af for d in clusters[gid]])
            min_af = '%.2f' % min([d.af for d in clusters[gid]])
        else:
            mean_ani = min_ani = mean_af = min_af = 'N/A'
        fout.write('%s\t%s\t%s' % (gid, genomes[gid].gtdb_taxa.species,
                                   genomes[gid].ncbi_taxa.species))

        ani, af, closest_gid = rep_radius[gid]
        if not af:
            af = 0

        if not closest_gid or closest_gid == 'N/A':
            closest_gid = 'N/A'
            closest_sp = 'N/A'
        else:
            closest_sp = genomes[closest_gid].gtdb_taxa.species

        fout.write('\t%s\t%s\t%f\t%f' % (closest_sp, closest_gid, ani, af))

        fout.write('\t%d\t%s\t%s\t%s\t%s\t%s\n' %
                   (len(clusters[gid]), mean_ani, min_ani, mean_af, min_af,
                    ','.join([d.gid for d in clusters[gid]])))
    fout.close()
Beispiel #39
0
    def _num_lineages(self, tree, threshold):
        """Produce table with number of lineage for increasing mean branch lengths

        Parameters
        ----------
        tree : dendropy Tree
            Input tree.
        threshold : float
            Mean distance to terminal taxa used to define lineages.
            
        Returns
        -------
        int
            Number of lineages with multiple taxa.
        int 
            Number of lineages represented by a single leaf node.
        """
        
        stack = [tree.seed_node]
        num_lineages = 0
        num_terminal_lineages = 0
        while stack:
            node = stack.pop()
            
            # check if node is a leaf
            if node.is_leaf():
                num_terminal_lineages += 1
                continue
                                
            # check if node meets mean branch length criterion
            dists_to_tips = []
            for t in node.leaf_iter():
                dists_to_tips.append(self._dist_to_ancestor(t, node))
                
            if np_mean(dists_to_tips) > threshold:
                for c in node.child_node_iter():
                    stack.append(c)
                continue
                
            num_lineages += 1

            
        return num_lineages, num_terminal_lineages
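The traversal above can be exercised on a toy tree. The sketch below is an assumption (it requires dendropy 4 and inlines the _dist_to_ancestor helper); it counts lineages for a single threshold.

# Self-contained sketch (assumption) of the lineage-counting traversal above.
import dendropy
from numpy import mean as np_mean

def dist_to_ancestor(leaf, ancestor):
    """Sum branch lengths from a leaf up to the given ancestral node."""
    d = 0.0
    n = leaf
    while n != ancestor:
        d += n.edge_length
        n = n.parent_node
    return d

tree = dendropy.Tree.get(data='((A:0.1,B:0.1):0.5,(C:0.4,D:0.4):0.2);', schema='newick')
threshold = 0.3
stack = [tree.seed_node]
num_lineages = num_terminal_lineages = 0
while stack:
    node = stack.pop()
    if node.is_leaf():
        num_terminal_lineages += 1
        continue
    if np_mean([dist_to_ancestor(t, node) for t in node.leaf_iter()]) > threshold:
        stack.extend(node.child_node_iter())
        continue
    num_lineages += 1

print(num_lineages, num_terminal_lineages)  # 1 multi-taxon lineage, 2 terminal lineages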
 def _rep_genome_stats(self, clusters, genome_files):
     """Calculate statistics relative to representative genome."""
     
     self.logger.info('Calculating statistics relative to cluster representatives:')
     stats = {}
     for idx, (rid, cids) in enumerate(clusters.items()):
         if len(cids) == 0:
             stats[rid] = self.RepStats(min_ani = -1,
                                         mean_ani = -1,
                                         std_ani = -1,
                                         median_ani = -1)
         else:
             # calculate ANI to representative genome
             gid_pairs = []
             for cid in cids:
                 gid_pairs.append((cid, rid))
             ani_af = self.ani_cache.fastani_pairs(gid_pairs, 
                                                     genome_files, 
                                                     report_progress=False)
             
             # calculate statistics
             anis = [ani_af[cid][rid][0] for cid in cids]
             stats[rid] = self.RepStats(min_ani = min(anis),
                                         mean_ani = np_mean(anis),
                                         std_ani = np_std(anis),
                                         median_ani = np_median(anis))
                                         
         statusStr = '-> Processing %d of %d (%.2f%%) clusters.'.ljust(86) % (
                             idx+1, 
                             len(clusters), 
                             float((idx+1)*100)/len(clusters))
         sys.stdout.write('%s\r' % statusStr)
         sys.stdout.flush()
             
     sys.stdout.write('\n')
         
     return stats
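A hedged sketch of the per-cluster ANI summary computed above; the plain dictionary stands in for the FastANI cache, whose entries are assumed to map query and reference genome IDs to (ANI, AF) tuples.

# Sketch (assumption) of the min/mean/std/median ANI statistics above.
from collections import namedtuple
from numpy import mean as np_mean, std as np_std, median as np_median

RepStats = namedtuple('RepStats', ['min_ani', 'mean_ani', 'std_ani', 'median_ani'])

ani_af = {'G1': {'REP': (98.6, 0.92)},
          'G2': {'REP': (97.1, 0.88)},
          'G3': {'REP': (99.0, 0.95)}}

anis = [ani_af[cid]['REP'][0] for cid in ani_af]
stats = RepStats(min(anis), np_mean(anis), np_std(anis), np_median(anis))
print(stats)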
Beispiel #41
0
    def run(self, scaffold_stats, num_clusters, num_components, K, no_coverage, no_pca, iterations, genome_file, output_dir):
        """Calculate statistics for genomes.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        num_clusters : int
            Number of clusters to form.
        num_components : int
            Number of PCA components to consider.
        K : int
            K-mer size to use for calculating genomic signature.
        no_coverage : boolean
            Flag indicating that coverage information should not be used during clustering.
        no_pca : boolean
            Flag indicating that PCA of the genomic signature should not be calculated.
        iterations : int
            Iterations of clustering to perform.
        genome_file : str
            Sequences being clustered.
        output_dir : str
            Directory to write results.
        """

        # get GC and mean coverage for each scaffold in genome
        self.logger.info('')
        self.logger.info('  Determining mean coverage and genomic signatures.')
        signatures = GenomicSignature(K)
        genome_stats = []
        signature_matrix = []
        seqs = seq_io.read(genome_file)
        for seq_id, seq in seqs.iteritems():
            stats = scaffold_stats.stats[seq_id]

            if not no_coverage:
                genome_stats.append((np_mean(stats.coverage)))
            else:
                genome_stats.append(())

            if K == 0:
                pass
            elif K == 4:
                signature_matrix.append(stats.signature)
            else:
                sig = signatures.seq_signature(seq)
                total_kmers = sum(sig)
                for i in xrange(0, len(sig)):
                    sig[i] = float(sig[i]) / total_kmers
                signature_matrix.append(sig)

        # calculate PCA of tetranucleotide signatures
        if K != 0:
            if not no_pca:
                self.logger.info('  Calculating PCA of genomic signatures.')
                pc, variance = self.pca(signature_matrix)
                self.logger.info('    First %d PCs capture %.1f%% of the variance.' % (num_components, sum(variance[0:num_components]) * 100))
    
                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, pc[i][0:num_components])
            else:
                self.logger.info('  Using complete genomic signature.')
                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, signature_matrix[i])

        # whiten data if feature matrix contains coverage and genomic signature data
        if not no_coverage and K != 0:
            print '  Whitening data.'
            genome_stats = whiten(genome_stats)
        else:
            genome_stats = np_array(genome_stats)

        # cluster
        self.logger.info('  Partitioning genome into %d clusters.' % num_clusters)

        bError = True
        while bError:
            try:
                bError = False
                _centroids, labels = kmeans2(genome_stats, num_clusters, iterations, minit='points', missing='raise')
            except ClusterError:
                bError = True

        for k in range(num_clusters):
            self.logger.info('    Placed %d sequences in cluster %d.' % (sum(labels == k), (k + 1)))

        # write out clusters
        genome_id = remove_extension(genome_file)
        for k in range(num_clusters):
            fout = open(os.path.join(output_dir, genome_id + '_c%d' % (k + 1) + '.fna'), 'w')
            for i in np_where(labels == k)[0]:
                seq_id = seqs.keys()[i]
                fout.write('>' + seq_id + '\n')
                fout.write(seqs[seq_id] + '\n')
            fout.close()
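The clustering step above relies on scipy.cluster.vq. The following minimal sketch, on synthetic data rather than genomic signatures, shows the same whiten + kmeans2 pattern with the retry-on-ClusterError loop; everything here is an assumption, not the author's pipeline.

# Sketch (assumption) of whiten + kmeans2 on synthetic 3-D feature vectors.
import numpy as np
from scipy.cluster.vq import whiten, kmeans2, ClusterError

rng = np.random.default_rng(0)
features = np.vstack((rng.normal(0, 1, (50, 3)), rng.normal(5, 1, (50, 3))))
features = whiten(features)  # scale each feature to unit variance

while True:
    try:
        # missing='raise' throws ClusterError when a cluster ends up empty
        centroids, labels = kmeans2(features, 2, iter=10, minit='points', missing='raise')
        break
    except ClusterError:
        continue

print(np.bincount(labels))  # number of points placed in each cluster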
Beispiel #42
0
    def run(self, input_tree, trusted_taxa_file, min_children, taxonomy_file, output_dir):
        """Calculate distribution of branch lengths at each taxonomic rank.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        trusted_taxa_file : str
            File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        min_children : int
            Only consider taxa with at least the specified number of children taxa when inferring distribution.
        taxonomy_file : str
            File containing taxonomic information for leaf nodes (if NULL, read taxonomy from tree).
        output_dir : str
            Desired output directory.
        """

        tree = dendropy.Tree.get_from_path(input_tree, 
                                            schema='newick', 
                                            rooting='force-rooted', 
                                            preserve_underscores=True)
        
        # pull taxonomy from tree
        if not taxonomy_file:
            self.logger.info('Reading taxonomy from tree.')
            taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
            taxonomy = Taxonomy().read_from_tree(input_tree)
            Taxonomy().write(taxonomy, taxonomy_file)
        else:
            self.logger.info('Reading taxonomy from file.')
            taxonomy = Taxonomy().read(taxonomy_file)
            
        # read trusted taxa
        trusted_taxa = None
        if trusted_taxa_file:
            trusted_taxa = read_taxa_file(trusted_taxa_file)
        
        # determine taxa to be used for inferring distribution
        taxa_for_dist_inference = filter_taxa_for_dist_inference(tree, taxonomy, set(), min_children, -1)
        
        # determine branch lengths to leaves for named lineages
        rank_bl_dist = defaultdict(list)
        taxa_bl_dist = defaultdict(list)
        taxa_at_rank = defaultdict(list)
        for node in tree.postorder_node_iter():
            if node.is_leaf() or not node.label:
                continue
                
            _support, taxon, _auxiliary_info = parse_label(node.label)
            if not taxon:
                continue
                
            # get most specific rank in multi-rank taxa string
            taxa = [t.strip() for t in taxon.split(';')]
            taxon = taxa[-1]
            
            most_specific_rank = taxon[0:3]
            taxa_at_rank[Taxonomy.rank_index[most_specific_rank]].append(taxon)
                
            for n in node.leaf_iter():
                dist_to_node = 0
                while n != node:
                    dist_to_node += n.edge_length
                    n = n.parent_node
                
                for t in taxa:
                    taxa_bl_dist[t].append(dist_to_node)

            rank = Taxonomy.rank_labels[Taxonomy.rank_index[most_specific_rank]]
            if rank != 'species' or Taxonomy().validate_species_name(taxon):
                if taxon in taxa_for_dist_inference:
                    rank_bl_dist[rank].append(np_mean(taxa_bl_dist[taxon]))
                            
        # report number of taxa at each rank
        print ''
        print 'Rank\tTaxa\tTaxa for Inference'
        for rank, taxa in taxa_at_rank.iteritems():
            taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
            print '%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference))
        print ''
                    
        # report results sorted by rank
        sorted_taxon = []
        for rank_prefix in Taxonomy.rank_prefixes:
            taxa_at_rank = []
            for taxon in taxa_bl_dist:
                if taxon.startswith(rank_prefix):
                    taxa_at_rank.append(taxon)
                    
            sorted_taxon += sorted(taxa_at_rank)
                
        # report results for each named group
        taxa_file = os.path.join(output_dir, 'taxa_bl_dist.tsv')
        fout = open(taxa_file, 'w')
        fout.write('Taxa\tUsed for Inference\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
        for taxon in sorted_taxon:
            dist = taxa_bl_dist[taxon]

            p = np_percentile(dist, [5, 10, 50, 90, 95])
            fout.write('%s\t%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (taxon,
                                                                str(taxon in taxa_for_dist_inference),
                                                                np_mean(dist),
                                                                np_std(dist),
                                                                p[0], p[1], p[2], p[3], p[4]))
        fout.close()
        
        # report results for each taxonomic rank
        rank_file = os.path.join(output_dir, 'rank_bl_dist.tsv')
        fout = open(rank_file, 'w')
        fout.write('Rank\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
        for rank in Taxonomy.rank_labels:
            dist = rank_bl_dist[rank]
            p = np_percentile(dist, [5, 10, 50, 90, 95])
            fout.write('%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (rank,
                                                                np_mean(dist),
                                                                np_std(dist),
                                                                p[0], p[1], p[2], p[3], p[4]))
        fout.close()
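A small sketch of the summary row written for each taxon above: mean, standard deviation, and the 5th/10th/50th/90th/95th percentiles of a branch length sample. The taxon name and values are illustrative only.

# Sketch (assumption) of one row of the taxa_bl_dist.tsv output above.
from numpy import mean as np_mean, std as np_std, percentile as np_percentile

dist = [0.12, 0.15, 0.11, 0.20, 0.18, 0.14]
p = np_percentile(dist, [5, 10, 50, 90, 95])
row = '%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g' % (('g__ExampleGenus', np_mean(dist), np_std(dist)) + tuple(p))
print(row)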
        
    def __init__(self,
                 orbit,
                 orbit_dict,
                 q_rects = None,
                 roi_movable = False,
                 lock_aspect = True,
                 parent = None,
                 labels = 1,
                 x_label = 'x',
                 y_label = 'y',
                 x_unit = "",
                 y_unit = "",
                 v_offset = (0,0),
                 prefs = None,
                 depth_meas = True,
                 iface = None):

        super(OrbitViewer, self).__init__(parent)

        self.plots = []
        data_f = []
        sim_f = []
        self.v_offset = v_offset
        self.v_offset_data = self.v_offset[0]
        self.v_offset_sim = self.v_offset[1]
        self.orbit_label = orbit_dict.get_instrument() + " - Orbit "+str(orbit)
        self.x_unit = x_unit
        self.y_unit = y_unit
        self.orbit_dict=orbit_dict
        self.prefs = prefs
        self.iface = iface

        if orbit_dict.data:
            for band in orbit_dict.data:
                data_f.append(np_mean(band,0))

        else:
            for band in orbit_dict.sim:
                data_f.append(np_zeros(band.shape[1:]))


        if orbit_dict.sim:
            for band in orbit_dict.sim:
                sim_f.append(np_mean(band,0))

        else:
            for band in orbit_dict.data:
                sim_f.append(np_zeros(band.shape[1:]))

        ii = 0
        for band in orbit_dict.data:
            depth_cb = CreateDepthLayer(self.orbit_dict, ii, QgsProject.instance().readPath("./"), self.iface)
            self.plots.append(SinglePlot(images = [data_f[ii], sim_f[ii]],
                                         images_label = ["data", "sim"],
                                         label_text = self.orbit_label+" Frequency band "+str(ii+1),
                                         q_rects = q_rects,
                                         roi_movable = roi_movable,
                                         lock_aspect = lock_aspect,
                                         x_label = x_label,
                                         y_label = y_label,
                                         x_unit = x_unit,
                                         y_unit = y_unit,
                                         depth_cb = depth_cb.run,
                                         depth_meas = depth_meas))

            self.addItem(self.plots[-1], row=0, col=(ii))

            ii = ii + 1

        self.set_pos_label(0)
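A sketch of the band averaging used when building the plots above, assuming each band is a 3-D NumPy array; the zero image mirrors the fallback used when only simulations (or only data) are present.

# Sketch (assumption) of reducing a band to a 2-D image by averaging its first axis.
import numpy as np

band = np.arange(2 * 3 * 4, dtype=float).reshape(2, 3, 4)  # (passes, rows, cols)
data_frame = np.mean(band, 0)        # shape (3, 4)
fallback = np.zeros(band.shape[1:])  # same shape, all zeros
print(data_frame.shape, fallback.shape)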
    def _pairwise_stats(self, clusters, genome_files):
        """Calculate statistics for all pairwise comparisons in a species cluster."""
        
        self.logger.info('Calculating statistics for all pairwise comparisons in a species cluster:')
        stats = {}
        for idx, (rid, cids) in enumerate(clusters.items()):
            statusStr = '-> Processing %d of %d (%.2f%%) clusters (size = %d).'.ljust(86) % (
                                idx+1, 
                                len(clusters), 
                                float((idx+1)*100)/len(clusters),
                                len(cids))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
                                
            if len(cids) == 0:
                stats[rid] = self.PairwiseStats(min_ani = -1,
                                                mean_ani = -1,
                                                std_ani = -1,
                                                median_ani = -1,
                                                ani_to_medoid = -1,
                                                mean_ani_to_medoid = -1,
                                                ani_below_95 = -1)
            else:
                if len(cids) > self.max_genomes_for_stats:
                    cids = set(random.sample(cids, self.max_genomes_for_stats))
                
                # calculate ANI to representative genome
                gid_pairs = []
                gids = list(cids.union([rid]))
                for gid1, gid2 in combinations(gids, 2):
                    gid_pairs.append((gid1, gid2))
                    gid_pairs.append((gid2, gid1))
                    
                ani_af = self.ani_cache.fastani_pairs(gid_pairs, 
                                                        genome_files, 
                                                        report_progress=False)
                                                        
                # calculate medoid point
                if len(gids) > 2:
                    dist_mat = np_zeros((len(gids), len(gids)))
                    for i, gid1 in enumerate(gids):
                        for j, gid2 in enumerate(gids):
                            if i < j:
                                ani, af = symmetric_ani(ani_af, gid1, gid2)
                                # convert ANI (a similarity) to a distance so the
                                # argmin below selects the true medoid
                                dist_mat[i, j] = 100.0 - ani
                                dist_mat[j, i] = 100.0 - ani

                    medoid_idx = np_argmin(dist_mat.sum(axis=0))
                    medoid_gid = gids[medoid_idx]
                else:
                    # with only 2 genomes in a cluster, the representative is the
                    # natural medoid at least for reporting statistics for the
                    # individual species cluster
                    medoid_gid = rid
                    
                mean_ani_to_medoid = np_mean([symmetric_ani(ani_af, gid, medoid_gid)[0] 
                                                for gid in gids if gid != medoid_gid])

                # calculate statistics
                anis = []
                for gid1, gid2 in combinations(gids, 2):
                    ani, af = symmetric_ani(ani_af, gid1, gid2)
                    anis.append(ani)
                    
                stats[rid] = self.PairwiseStats(min_ani = min(anis),
                                                mean_ani = np_mean(anis),
                                                std_ani = np_std(anis),
                                                median_ani = np_median(anis),
                                                ani_to_medoid = symmetric_ani(ani_af, rid, medoid_gid)[0],
                                                mean_ani_to_medoid = mean_ani_to_medoid,
                                                ani_below_95 = sum([1 for ani in anis if ani < 95]))

        sys.stdout.write('\n')
            
        return stats
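A sketch of the medoid selection above on made-up ANI values: pairwise ANI is converted to a distance (100 - ANI) and the genome with the smallest summed distance to all others is chosen. The gids and ANI values are illustrative only.

# Sketch (assumption) of medoid selection from pairwise ANI values.
import numpy as np

gids = ['G1', 'G2', 'G3']
ani = {('G1', 'G2'): 98.0, ('G1', 'G3'): 96.5, ('G2', 'G3'): 97.0}

dist_mat = np.zeros((len(gids), len(gids)))
for i, gid1 in enumerate(gids):
    for j, gid2 in enumerate(gids):
        if i < j:
            d = 100.0 - ani[(gid1, gid2)]
            dist_mat[i, j] = dist_mat[j, i] = d

medoid_gid = gids[int(np.argmin(dist_mat.sum(axis=0)))]
print(medoid_gid)  # G2 minimizes the summed distance to the other genomes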
Beispiel #45
0
    def identify(self, scaffold_stats, genome_stats,
                        gc_per, td_per,
                        cov_corr, cov_perc,
                        report_type, output_file):
        """Identify scaffolds with divergent genomic characteristics.

        Outliers are identified independently based on GC content,
        tetranucleotide signatures, coverage profile correlation, and
        mean absolute percent error of coverage profile. The coverage correlation
        check is ignored if the coverage profile consists of a single value.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        genome_stats : GenomeStats
            Statistics for individual genomes.
        gc_per : int.
            Percentile for identifying GC outliers
        td_per : int
            Percentile for identifying TD outliers.
        cov_corr : int
            Correlation for identifying divergent coverage profiles.
        cov_perc : int
            Mean absolute percent error for identifying divergent coverage profiles.
        report_type : str
            Report scaffolds that are outliers in 'all' or 'any' distribution.
        output_file : str
            Name of output file.
        """

        # read reference distributions from file
        self.logger.info('  Reading reference distributions.')
        self.gc_dist = self._read_distribution('gc_dist')
        self.td_dist = self._read_distribution('td_dist')

        # identify outliers in each genome
        fout = open(output_file, 'w')
        fout.write('Scaffold id\tGenome id\tScaffold length (bp)\tOutlying distributions')
        fout.write('\tScaffold GC\tMean genome GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)' % (gc_per, gc_per))
        fout.write('\tScaffold TD\tMean genome TD\tUpper TD bound (%s%%)' % td_per)
        fout.write('\tMean scaffold coverage\tMean genome coverage\tCoverage correlation\tMean coverage error\n')

        genomic_signature = GenomicSignature(0)

        processed_genomes = 0
        for genome_id, scaffold_ids in scaffold_stats.scaffolds_in_genome.iteritems():
            processed_genomes += 1

            sys.stdout.write('    Finding outliers in %d of %d (%.1f%%) genomes.\r' % (processed_genomes,
                                                                                     scaffold_stats.num_genomes(),
                                                                                     processed_genomes * 100.0 / scaffold_stats.num_genomes()))
            sys.stdout.flush()

            # find keys into GC and TD distributions
            # gc -> [mean GC][scaffold length][percentile]
            # td -> [scaffold length][percentile]
            gs = genome_stats[genome_id]
            closest_gc = find_nearest(self.gc_dist.keys(), gs.mean_gc / 100.0)
            sample_seq_len = self.gc_dist[closest_gc].keys()[0]
            d = self.gc_dist[closest_gc][sample_seq_len]
            gc_lower_bound_key = find_nearest(d.keys(), (100 - gc_per) / 2.0)
            gc_upper_bound_key = find_nearest(d.keys(), (100 + gc_per) / 2.0)

            td_bound_key = find_nearest(self.td_dist[self.td_dist.keys()[0]].keys(), td_per)

            for scaffold_id in scaffold_ids:
                stats = scaffold_stats.stats[scaffold_id]

                # find GC and TD bounds
                closest_seq_len = find_nearest(self.gc_dist[closest_gc].keys(), stats.length)
                gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
                gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]

                closest_seq_len = find_nearest(self.td_dist.keys(), stats.length)
                td_bound = self.td_dist[closest_seq_len][td_bound_key]

                # find changes from mean
                delta_gc = (stats.gc - gs.mean_gc) / 100.0
                delta_td = genomic_signature.manhattan(stats.signature, gs.mean_signature)

                # determine if scaffold is an outlier
                outlying_dists = []
                if delta_gc < gc_lower_bound or delta_gc > gc_upper_bound:
                    outlying_dists.append('GC')

                if delta_td > td_bound:
                    outlying_dists.append('TD')

                corr_r = 1.0
                if len(gs.mean_coverage) > 1:
                    corr_r, _corr_p = pearsonr(gs.mean_coverage, stats.coverage)
                    if  corr_r < cov_corr:
                        outlying_dists.append('COV_CORR')

                mean_cp = []
                for cov_genome, cov_scaffold in itertools.izip(gs.mean_coverage, stats.coverage):
                    if cov_genome >= self.min_required_coverage:
                        mean_cp.append(abs(cov_scaffold - cov_genome) * 100.0 / cov_genome)

                if len(mean_cp) == 0:
                    # genome has essentially zero coverage, which generally
                    # indicates something is wrong
                    mean_cp = -1
                    outlying_dists.append('COV_PERC')
                else:
                    mean_cp = np_mean(mean_cp)
                    if mean_cp > cov_perc:
                        outlying_dists.append('COV_PERC')

                # report outliers
                if (report_type == 'any' and len(outlying_dists) >= 1) or (report_type == 'all' and len(outlying_dists) >= 3):
                    fout.write('%s\t%s\t%s\t%s' % (scaffold_id, genome_id, stats.length, ','.join(outlying_dists)))
                    fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (stats.gc, gs.mean_gc, gs.mean_gc + gc_lower_bound * 100, gs.mean_gc + gc_upper_bound * 100))
                    fout.write('\t%.3f\t%.3f\t%.3f' % (delta_td, gs.mean_td, td_bound))
                    fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (np_mean(stats.coverage), np_mean(gs.mean_coverage), corr_r, mean_cp))
                    fout.write('\n')

        sys.stdout.write('\n')
        fout.close()
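A sketch of the two coverage checks above on toy coverage profiles: the Pearson correlation against the genome mean profile and the mean absolute percent error of the scaffold coverage. The numbers are illustrative only.

# Sketch (assumption) of the COV_CORR and COV_PERC checks above.
from numpy import mean as np_mean
from scipy.stats import pearsonr

genome_cov = [35.0, 12.0, 48.0]     # mean genome coverage per sample
scaffold_cov = [30.0, 15.0, 52.0]   # scaffold coverage per sample

corr_r, _p = pearsonr(genome_cov, scaffold_cov)
mean_cp = np_mean([abs(s - g) * 100.0 / g for g, s in zip(genome_cov, scaffold_cov)])
print('%.2f %.1f%%' % (corr_r, mean_cp))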
    def _percent_correct_plot(self, rel_dists, taxa_for_dist_inference, output_prefix):
        """Create plots showing correctly classified taxa for different relative distance values.

        Parameters
        ----------
        rel_dists : d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring relative divergence thresholds.
        output_prefix : str
            Prefix for plots.
        """

        print ''
        print '  Relative divergence thresholds (rank, threshold, parent taxa, child taxa):'

        ranks = sorted(rel_dists.keys())
        rel_dist_thresholds = []
        for i in xrange(ranks[0], ranks[-1]):
            parent_rank = i
            child_rank = i + 1

            # determine classification results for relative divergence
            # values between the medians of adjacent taxonomic ranks
            parent_rds = []
            for taxa, rd in rel_dists[parent_rank].iteritems():
                if taxa in taxa_for_dist_inference:
                    parent_rds.append(rd)
            parent_p50 = np_percentile(parent_rds, 50)

            child_rds = []
            for taxa, rd in rel_dists[child_rank].iteritems():
                if taxa in taxa_for_dist_inference:
                    child_rds.append(rd)

            child_p50 = np_percentile(child_rds, 50)

            r = []
            y_parent = []
            y_child = []
            y_mean_corr = []
            for test_r in np_linspace(parent_p50, child_p50, 100):
                parent_cor = float(sum([1 for rd in parent_rds if rd <= test_r])) / len(parent_rds)
                child_cor = float(sum([1 for rd in  child_rds if rd > test_r])) / len(child_rds)

                r.append(test_r)
                y_parent.append(parent_cor)
                y_child.append(child_cor)
                y_mean_corr.append(0.5 * parent_cor + 0.5 * child_cor)

            # create plot of correctly classified taxa
            self.fig.clear()
            self.fig.set_size_inches(6, 6)
            ax = self.fig.add_subplot(111)

            ax.plot(r, y_parent, 'k--', label=Taxonomy.rank_labels[i])
            ax.plot(r, y_child, 'k:', label=Taxonomy.rank_labels[i + 1])
            ax.plot(r, y_mean_corr, 'r-', label='mean')

            legend = ax.legend(loc='upper left')
            legend.draw_frame(False)

            # find maximum of mean correct classification
            max_mean = max(y_mean_corr)
            r_max_values = [r[i] for i, rd in enumerate(y_mean_corr) if rd == max_mean]
            r_max_value = np_mean(r_max_values)  # Note: if there are multiple, non-adjacent maxima this average may not itself be a maximum
            print '    %s\t%.3f\t%d\t%d' % (Taxonomy.rank_labels[parent_rank], r_max_value, len(parent_rds), len(child_rds))

            # check that there is a single local maximum
            rd_indices = [i for i, rd in enumerate(y_mean_corr) if rd == max_mean]
            for rd_index in xrange(0, len(rd_indices) - 1):
                if rd_indices[rd_index] != rd_indices[rd_index + 1] - 1:
                    print '[Warning] There are multiple local maxima, so estimated relative divergence threshold will be invalid.'

            rel_dist_thresholds.append(r_max_value)

            y_min, _y_max = ax.get_ylim()
            ax.axvline(x=r_max_value, ymin=0, ymax=1, color='r', ls='--')
            ax.text(r_max_value + 0.001, y_min + 0.01, '%.3f' % r_max_value, horizontalalignment='left')

            ax.set_xlabel('relative distance')
            ax.set_ylabel('% taxa correctly classified')

            self.prettify(ax)

            self.fig.tight_layout(pad=1)
            self.fig.savefig(output_prefix + '.%s_%s.png' % (Taxonomy.rank_labels[parent_rank], Taxonomy.rank_labels[child_rank]), dpi=96)

        print ''

        return rel_dist_thresholds
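A compact sketch of the threshold scan above on made-up relative divergence values: thresholds between the two medians are scored by the mean fraction of correctly classified parent and child taxa, and the best-scoring threshold is reported.

# Sketch (assumption) of the relative divergence threshold scan above.
from numpy import linspace as np_linspace, percentile as np_percentile, argmax as np_argmax

parent_rds = [0.30, 0.35, 0.40, 0.42]
child_rds = [0.55, 0.60, 0.62, 0.70]

thresholds = np_linspace(np_percentile(parent_rds, 50), np_percentile(child_rds, 50), 100)
scores = [0.5 * (sum(rd <= t for rd in parent_rds) / len(parent_rds))
          + 0.5 * (sum(rd > t for rd in child_rds) / len(child_rds))
          for t in thresholds]
best = thresholds[int(np_argmax(scores))]
print('%.3f' % best)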
Beispiel #47
0
    def optimal(self, input_tree, 
                        rank,
                        min_dist, 
                        max_dist, 
                        step_size,
                        output_table):
        """Determine branch length for best congruency with existing taxonomy.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        rank : int
            Taxonomic rank to consider (1=Phylum, ..., 6=Species).
        min_dist : float
            Minimum mean branch length threshold to evaluate.
        max_dist : float
            Maximum mean branch length threshold to evaluate.
        step_size : float
            Step size between evaluated thresholds.
        output_table : str
            Name of output table.
        """
    
        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)
        
        # get mean distance to terminal taxa for each node along with
        # other stats needed to determine classification
        self.logger.info('Determining MDTT for each node.')
        rank_prefix = Taxonomy.rank_prefixes[rank]
        child_rank_prefix = Taxonomy.rank_prefixes[rank+1]
        rank_info = []
        rank_dists = set()                                
        for node in tree.seed_node.preorder_internal_node_iter():
            if node == tree.seed_node:
                continue
                
            # check if node is at the specified rank
            node_taxon = None
            if node.label:
                support, taxon_name, _auxiliary_info = parse_label(node.label)
                
                if taxon_name:
                    for taxon in [x.strip() for x in taxon_name.split(';')]:
                        if taxon.startswith(rank_prefix):
                            node_taxon = taxon
                        
            if not node_taxon:
                continue
                
            # check that node has two descendants at the next rank
            child_rank_taxa = []
            for c in node.levelorder_iter():
                if c.label:
                    support, taxon_name, _auxiliary_info = parse_label(c.label)
                    
                    if taxon_name:
                        for taxon in [x.strip() for x in taxon_name.split(';')]:
                            if taxon.startswith(child_rank_prefix):
                                child_rank_taxa.append(taxon)
                            
                if len(child_rank_taxa) >= 2:
                    break
                    
            if len(child_rank_taxa) < 2:
                continue
                
            # get mean branch length to terminal taxa
            dists_to_tips = []
            for t in node.leaf_iter():
                dists_to_tips.append(self._dist_to_ancestor(t, node))
                
            node_dist = np_mean(dists_to_tips)
            
            # get mean branch length to terminal taxa for first ancestor spanning multiple phyla
            ancestor = self._ancestor_multiple_taxa_at_rank(node, rank_prefix)
            
            ancestor_dists_to_tips = []
            for t in ancestor.leaf_iter():
                ancestor_dists_to_tips.append(self._dist_to_ancestor(t, ancestor))
                
            ancestor_dist = np_mean(ancestor_dists_to_tips)
                    
            rank_info.append([node_dist, ancestor_dist, node_taxon])
            rank_dists.add(node_dist)
            
        self.logger.info('Calculating threshold from %d taxa with specified rank resolution.' % len(rank_info))
            
        fout = open('bl_optimal_taxa_dists.tsv' , 'w')
        fout.write('Taxon\tNode MDTT\tMulti-phyla Ancestor MDTT\n')
        for node_dist, ancestor_dist, node_taxon in rank_info:
            fout.write('%s\t%.3f\t%.3f\n' % (node_taxon, node_dist, ancestor_dist))
        fout.close()
                    
        # report number of correct and incorrect taxa for each threshold
        fout = open(output_table, 'w')
        header = 'Threshold\tCorrect\tIncorrect\tPrecision\tNo. Lineages\tNo. Multiple Taxa Lineages\tNo. Terminal Lineages'
        fout.write(header + '\n')
        print header
        
        top_correct = 0
        top_incorrect = 0
        top_precision = 0
        for d in np_arange(min_dist, max_dist+step_size, step_size):
            rank_dists.add(d)
            
        for dist_threshold in sorted(rank_dists, reverse=True):
            correct = 0
            incorrect = 0
            for node_dist, ancestor_dist, node_taxon in rank_info:
                # check if node/edge would be collapsed at the given threshold
                if node_dist <= dist_threshold and ancestor_dist > dist_threshold:
                    correct += 1
                elif node_dist > dist_threshold:
                    incorrect += 1
                else:
                    incorrect += 1 # above ancestor with multiple taxa
         
            denominator = correct + incorrect
            if denominator:
                precision = float(correct) / denominator
            else:
                precision = 0
                
            num_lineages, num_terminal_lineages = self._num_lineages(tree, dist_threshold)
                    
            row = '%f\t%d\t%d\t%.3f\t%d\t%d\t%d' % (dist_threshold, 
                                                            correct, 
                                                            incorrect, 
                                                            precision,
                                                            num_lineages + num_terminal_lineages,
                                                            num_lineages, 
                                                            num_terminal_lineages)
                                                            
            fout.write(row + '\n')
            print row
            
            if precision > top_precision:
                top_correct = correct
                top_incorrect = incorrect
                top_precision = precision
                top_threshold = dist_threshold
                
        return top_threshold, top_correct, top_incorrect
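A sketch of the congruency scoring above on made-up values: a taxon counts as correct at a threshold when its own mean distance to terminal taxa (MDTT) falls at or below the threshold while the MDTT of its first multi-taxon ancestor stays above it.

# Sketch (assumption) of scanning thresholds for the best taxonomic congruency.
from numpy import arange as np_arange

# (node MDTT, ancestor MDTT, taxon) triples; values are illustrative only
rank_info = [(0.20, 0.45, 'p__A'), (0.30, 0.50, 'p__B'), (0.48, 0.60, 'p__C')]

best = (0.0, None)
for dist_threshold in np_arange(0.15, 0.65, 0.05):
    correct = sum(1 for nd, ad, _ in rank_info if nd <= dist_threshold < ad)
    precision = float(correct) / len(rank_info)
    if precision > best[0]:
        best = (precision, dist_threshold)

print('best threshold %.2f with precision %.2f' % (best[1], best[0]))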
Beispiel #48
0
    def compatible(self, scaffolds_of_interest,
                        scaffold_stats,
                        genome_stats,
                        gc_per, td_per,
                        cov_corr, cov_perc,
                        report_type, output_file):
        """Identify scaffolds with compatible genomic characteristics.

        Compatible scaffolds are identified based on GC content,
        tetranucleotide signatures, coverage profile correlation, and
        mean absolute percent error of coverage profile. The coverage correlation
        check is ignored if the coverage profile consists of a single value.

        Parameters
        ----------
        scaffolds_of_interest : d[scaffold_id] -> [no. genes, perc. genes with homology]
            Scaffolds to consider for compatibility.
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds to check.
        genome_stats : GenomeStats
            Statistics for individual genomes.
        gc_per : int
            Percentile for identifying GC outliers.
        td_per : int
            Percentile for identifying TD outliers.
        cov_corr : int
            Correlation for identifying divergent coverage profiles.
        cov_perc : int
            Mean absolute percent error for identifying divergent coverage profiles.
        report_type : str
            Report scaffolds that are outliers in 'all' or 'any' distribution.
        output_file : str
            Name of output file.
        """

        # read reference distributions from file
        self.logger.info('')
        self.logger.info('  Reading reference distributions.')
        self.gc_dist = self._read_distribution('gc_dist')
        self.td_dist = self._read_distribution('td_dist')

        # identify compatible scaffolds in each genome
        fout = open(output_file, 'w')
        fout.write('Scaffold id\tGenome id\tScaffold length (bp)\tCompatible distributions')
        fout.write('\tScaffold GC\tMean genome GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)' % (gc_per, gc_per))
        fout.write('\tScaffold TD\tMean genome TD\tUpper TD bound (%s%%)' % td_per)
        fout.write('\tMean scaffold coverage\tMean genome coverage\tCoverage correlation\tMean coverage error')
        fout.write('\t# genes\t% genes with homology\n')

        genomic_signature = GenomicSignature(0)

        self.logger.info('  Identifying scaffolds compatible with bins.')
        processed_scaffolds = 0
        for scaffold_id, ss in scaffold_stats.stats.iteritems():
            processed_scaffolds += 1
            sys.stdout.write('    Processed %d of %d (%.1f%%) scaffolds.\r' % (processed_scaffolds,
                                                                         len(scaffold_stats.stats),
                                                                         processed_scaffolds * 100.0 / len(scaffold_stats.stats)))
            sys.stdout.flush()

            if scaffold_id not in scaffolds_of_interest:
                continue

            for genome_id, gs in genome_stats.iteritems():
                # find keys into GC and TD distributions
                # gc -> [mean GC][scaffold length][percentile]
                # td -> [scaffold length][percentile]
                closest_gc = find_nearest(self.gc_dist.keys(), gs.mean_gc / 100.0)
                sample_seq_len = self.gc_dist[closest_gc].keys()[0]
                d = self.gc_dist[closest_gc][sample_seq_len]
                gc_lower_bound_key = find_nearest(d.keys(), (100 - gc_per) / 2.0)
                gc_upper_bound_key = find_nearest(d.keys(), (100 + gc_per) / 2.0)

                td_bound_key = find_nearest(self.td_dist[self.td_dist.keys()[0]].keys(), td_per)

                # find GC and TD bounds
                closest_seq_len = find_nearest(self.gc_dist[closest_gc].keys(), ss.length)
                gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
                gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]

                closest_seq_len = find_nearest(self.td_dist.keys(), ss.length)
                td_bound = self.td_dist[closest_seq_len][td_bound_key]

                # find changes from mean
                delta_gc = (ss.gc - gs.mean_gc) / 100.0
                delta_td = genomic_signature.manhattan(ss.signature, gs.mean_signature)

                # determine if scaffold compatible
                compatible_dists = []
                if delta_gc >= gc_lower_bound and delta_gc <= gc_upper_bound:
                    compatible_dists.append('GC')

                if delta_td <= td_bound:
                    compatible_dists.append('TD')

                corr_r = 1.0
                if len(gs.mean_coverage) > 1:
                    corr_r, _corr_p = pearsonr(gs.mean_coverage, ss.coverage)
                    if  corr_r >= cov_corr:
                        compatible_dists.append('COV_CORR')

                mean_cp = []
                for cov_genome, cov_scaffold in itertools.izip(gs.mean_coverage, ss.coverage):
                    if cov_genome >= self.min_required_coverage:
                        mean_cp.append(abs(cov_genome - cov_scaffold) * 100.0 / cov_genome)

                mean_cp = np_mean(mean_cp)
                if mean_cp <= cov_perc:
                    compatible_dists.append('COV_PERC')

                # report compatible scaffolds
                if (report_type == 'any' and len(compatible_dists) >= 1) or (report_type == 'all' and len(compatible_dists) >= 3):
                    fout.write('%s\t%s\t%s\t%s' % (scaffold_id, genome_id, ss.length, ','.join(compatible_dists)))
                    fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (ss.gc, gs.mean_gc, gs.mean_gc + gc_lower_bound * 100, gs.mean_gc + gc_upper_bound * 100))
                    fout.write('\t%.3f\t%.3f\t%.3f' % (delta_td, gs.mean_td, td_bound))
                    fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (np_mean(ss.coverage), np_mean(gs.mean_coverage), corr_r, mean_cp))
                    fout.write('\t%d\t%.1f' % (scaffolds_of_interest[scaffold_id][0], scaffolds_of_interest[scaffold_id][1]))
                    fout.write('\n')

        sys.stdout.write('\n')
        fout.close()
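A sketch of the bound lookup pattern shared by the outlier and compatibility checks above: a simple find_nearest helper (defined here as an assumption; the original code imports it from a utility module) selects the closest key in the pre-computed GC distribution, which is keyed as gc_dist[mean GC][scaffold length][percentile].

# Sketch (assumption) of keying into the reference GC distribution.
import numpy as np

def find_nearest(values, target):
    """Return the key closest to the target value."""
    values = np.asarray(sorted(values))
    return float(values[np.argmin(np.abs(values - target))])

gc_dist = {0.50: {5000: {2.5: -0.04, 97.5: 0.04}},
           0.60: {5000: {2.5: -0.05, 97.5: 0.05}}}

gc_per = 95
closest_gc = find_nearest(gc_dist.keys(), 0.57)
closest_len = find_nearest(gc_dist[closest_gc].keys(), 4200)
lower_key = find_nearest(gc_dist[closest_gc][closest_len].keys(), (100 - gc_per) / 2.0)
upper_key = find_nearest(gc_dist[closest_gc][closest_len].keys(), (100 + gc_per) / 2.0)
print(closest_gc, closest_len,
      gc_dist[closest_gc][closest_len][lower_key],
      gc_dist[closest_gc][closest_len][upper_key])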
Beispiel #49
0
    def run(self, rank, input_tree_dir, full_tree_file, derep_tree_file, taxonomy_file, output_prefix, min_children, title):

        # determine named clades in full tree
        named_clades = set()
        tree = dendropy.Tree.get_from_path(full_tree_file, 
                                            schema='newick', 
                                            rooting='force-rooted', 
                                            preserve_underscores=True)
        
        for node in tree.preorder_node_iter():
            if node.label:
                taxonomy = node.label.split(';')
                named_clades.add(taxonomy[-1].strip().split(':')[-1])

        print 'Identified %d named clades in full tree.' % len(named_clades)

        # determine named groups with at least the specified number of children
        print 'Determining taxa with sufficient named children lineages.'
        taxon_children = defaultdict(set)
        groups = defaultdict(list)
        print taxonomy_file
        for line in open(taxonomy_file):
            line_split = line.replace('; ', ';').split()
            genome_id = line_split[0]
            taxonomy = [x.strip() for x in line_split[1].split(';')]

            if len(taxonomy) > rank + 1:
                taxon_children[taxonomy[rank]].add(taxonomy[rank + 1])

            if len(taxonomy) > rank:
                groups[taxonomy[rank]].append(genome_id)

        groups_to_consider = set()
        for taxon, children_taxa in taxon_children.iteritems():
            if len(children_taxa) >= min_children and taxon in named_clades:
                groups_to_consider.add(taxon)

        print 'Assessing distribution over %d groups.' % len(groups_to_consider)

        # calculate relative distance for full tree
        print ''
        print 'Calculating relative distance over full tree.'
        tree = dendropy.Tree.get_from_path(full_tree_file, 
                                            schema='newick', 
                                            rooting='force-rooted', 
                                            preserve_underscores=True)
        full_rel_dist, _full_dist_components, polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups)
        if len(polyphyletic) > 0:
            print ''
            print '[Warning] Full tree contains polyphyletic groups.'

        # calculate relative distance for dereplicated tree
        print ''
        print 'Calculating relative distance over dereplicated tree.'
        tree = dendropy.Tree.get_from_path(derep_tree_file, 
                                            schema='newick', 
                                            rooting='force-rooted', 
                                            preserve_underscores=True)
        
        derep_rel_dist, derep_dist_components, polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups)

        groups_to_consider = groups_to_consider - polyphyletic
        print 'Assessing distribution over %d groups after removing polyphyletic groups in original trees.' % len(groups_to_consider)

        # calculate relative distance to each group in each tree
        print ''
        rel_dists = defaultdict(list)
        dist_components = defaultdict(list)
        for f in os.listdir(input_tree_dir):
            if not f.endswith('.rooted.tree'):
                continue

            print f

            tree_file = os.path.join(input_tree_dir, f)
            tree = dendropy.Tree.get_from_path(tree_file, 
                                            schema='newick', 
                                            rooting='force-rooted', 
                                            preserve_underscores=True)

            # calculate relative distance to named taxa
            rel_dist, components, _polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups)

            for taxon, dist in rel_dist.iteritems():
                rel_dists[taxon].append(dist)
                dist_components[taxon].append(components[taxon])

        # create scatter plot
        x = []
        y = []
        xDerep = []
        yDerep = []
        xFull = []
        yFull = []
        perc10 = []
        perc90 = []
        labels = []
        fout = open(output_prefix + '.tsv', 'w')
        fout.write('Taxon\tP10\tP90\tP90-P10\tMean rel. dist\tMean dist to parent\tMean dist to leaves\tOriginal rel. dist.\tOriginal dist to parent\tOriginal dist to leaves\n')
        for i, taxon in enumerate(sorted(rel_dists.keys(), reverse=True)):
            labels.append(taxon + ' (%d)' % (len(rel_dists[taxon])))

            rd = rel_dists[taxon]
            for d in rd:
                x.append(d)
                y.append(i + 0.2)

            p10, p90 = np_percentile(rd, [10, 90])
            perc10.append(p10)
            perc90.append(p90)

            print taxon, p90 - p10
            mean_x, mean_a, mean_b = np_mean(dist_components[taxon], axis=0)
            derep_x, derep_a, derep_b = derep_dist_components[taxon]
            fout.write('%s\t%.2f\t%.2f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n' % (taxon, p10, p90, p90 - p10, mean_x, mean_a, mean_b, derep_x, derep_a, derep_b))

            xDerep.append(derep_rel_dist[taxon])
            yDerep.append(i)

            xFull.append(full_rel_dist[taxon])
            yFull.append(i)
        fout.close()

        self.fig.clear()
        self.fig.set_size_inches(8, len(rel_dists) * 0.4)
        ax = self.fig.add_subplot(111)

        ax.scatter(x, y, alpha=0.5, s=24, c=(0.5, 0.5, 0.5), marker='s')
        ax.scatter(xDerep, yDerep, alpha=1.0, s=24, c=(1.0, 0.0, 0.0), marker='s')
        ax.scatter(xFull, yFull, alpha=1.0, s=24, c=(0.0, 0.0, 1.0), marker='*')

        for i in xrange(len(labels)):
            ax.plot((perc10[i], perc10[i]), (i, i + 0.4), 'r-')
            ax.plot((perc90[i], perc90[i]), (i, i + 0.4), 'r-')

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')
        if title:
            ax.set_title(title, size=12)

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('taxa')
        ax.set_yticks(xrange(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(labels)

        self.prettify(ax)

        # make plot interactive
        # mpld3.plugins.connect(fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        # mpld3.plugins.connect(fig, mpld3.plugins.MousePosition(fontsize=12))

        # mpld3.save_html(fig, output_prefix + '.html')
        self.fig.tight_layout(pad=1)
        self.fig.savefig(output_prefix + '.png', dpi=300)
Beispiel #50
0
    def table(self, input_tree, taxon_category_file, bl_step_size, output_table):
        """Produce table with number of lineage for increasing mean branch lengths

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        taxon_category_file : str
            File indicating category for each taxon in the tree.
        bl_step_size : float
            Step size in table for mean branch length criterion.
        output_table : str
            Name of output table.
        """
        
        # get category for each taxon
        taxon_category = {}
        for line in open(taxon_category_file):
            line_split = line.strip().split('\t')
            taxon_category[line_split[0]] = line_split[1]

        # read tree
        tree = dendropy.Tree.get_from_path(input_tree,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)
        
        # determine mean distance to leaves and taxon categories for each node
        all_categories = set()
        node_info = {}
        parent_mean_dist_to_leafs = {}
        max_bl_threshold = None
        for i, node in enumerate(tree.seed_node.preorder_iter()):
            node.id = i
            
            if node.is_leaf():
                mean_dist_to_leafs = 0.0
                categories = set()
                for c in taxon_category[node.taxon.label].split('/'):
                    categories.add(c)
            else:
                dist_to_leafs = []
                categories = set()
                for t in node.leaf_iter():
                    dist_to_leafs.append(self._dist_to_ancestor(t, node))
                    
                    for c in taxon_category[t.taxon.label].split('/'):
                        categories.add(c)

                mean_dist_to_leafs = np_mean(dist_to_leafs)
                
            if node.parent_node:
                p = parent_mean_dist_to_leafs[node.parent_node.id]
            else:
                p = mean_dist_to_leafs + 1e-6

            category = '/'.join(sorted(list(categories), reverse=True))
            all_categories.add(category)
            node_info[node.id] = [mean_dist_to_leafs, p, category] 
            parent_mean_dist_to_leafs[node.id] = mean_dist_to_leafs
            
            if max_bl_threshold is None or mean_dist_to_leafs > max_bl_threshold:
                max_bl_threshold = mean_dist_to_leafs
            
        # write table
        fout = open(output_table, 'w')
        fout.write('Threshold')
        for c in all_categories:
            fout.write('\t%s' % c)
        fout.write('\n')
        
        for bl_threshold in np_arange(0, max_bl_threshold + bl_step_size, bl_step_size):
            category_count = defaultdict(int)
            
            stack = [tree.seed_node]
            while stack:
                node = stack.pop()
                
                mean_dist_to_leafs, _, category = node_info[node.id]
                if mean_dist_to_leafs > bl_threshold:
                    for c in node.child_node_iter():
                        stack.append(c)
                else:
                    category_count[category] += 1
                                  
            # check if node meets mean branch length criterion
            if sum(category_count.values()) > 0:
                fout.write('%.3f' % bl_threshold)
                for c in all_categories:
                    fout.write('\t%d' % category_count[c])
                fout.write('\n')
                
        fout.close()
   
        if False:
            node_info.sort()
            for bl_threshold in np_arange(0, node_info[-1][0] + bl_step_size, bl_step_size):
                category_count = defaultdict(int)
                for mean_bl_dist, parent_mean_bl_dist, category in node_info:
                    if bl_threshold >= mean_bl_dist and bl_threshold < parent_mean_bl_dist:
                        category_count[category] += 1
                        
                if sum(category_count.values()) > 0:
                    fout.write('%.3f' % bl_threshold)
                    for c in all_categories:
                        fout.write('\t%d' % category_count[c])
                    fout.write('\n')
Beispiel #51
0
    def decorate(self, 
                    input_tree,
                    taxonomy_file,
                    threshold, 
                    rank, 
                    retain_named_lineages, 
                    keep_labels,
                    prune,
                    output_tree):
        """Produce table with number of lineage for increasing mean branch lengths

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        taxonomy_file : str
            File with taxonomic information for each taxon.
        threshold : float
            Branch length threshold.
        rank : int
            Rank of labels to retain on tree.
        retain_named_lineages : bool
            Retain existing named lineages at the specified rank.
        keep_labels : bool
            Keep existing labels on tree.
        prune : bool
            Prune tree to preserve only the shallowest and deepest taxa in each lineage.
        output_tree : str
            Name of output tree.
        """
        
        # read taxonomy
        taxonomy = Taxonomy().read(taxonomy_file)
        
        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)
        
        # decorate tree
        rank_prefix = Taxonomy.rank_prefixes[rank]
        new_name_number = defaultdict(int)
        ncbi_only = 0
        sra_only = 0
        
        labeled_nodes = set()
        
        stack = [tree.seed_node]
        while stack:
            node = stack.pop()
            
            # check if node is a leaf
            if node.is_leaf():
                continue
                
            # check if ancestor already has a label at this rank
            p = node
            parent_taxon = None
            while p and not parent_taxon:
                if p.label:
                    support, taxon_name, _auxiliary_info = parse_label(p.label)
                    
                    if taxon_name:
                        for taxon in [x.strip() for x in taxon_name.split(';')]:
                            if taxon.startswith(rank_prefix):
                                parent_taxon = taxon
                    
                p = p.parent_node
                    
            if retain_named_lineages and parent_taxon:
                for c in node.child_node_iter():
                    stack.append(c)
                continue
                
            # check if descendant node already has a label at this rank
            children_taxon = []
            for c in node.preorder_internal_node_iter():
                if c.label:
                    support, taxon_name, _auxiliary_info = parse_label(c.label)
                    
                    if taxon_name:
                        for taxon in [x.strip() for x in taxon_name.split(';')]:
                            if taxon.startswith(rank_prefix):
                                children_taxon.append(taxon)
                        
            if retain_named_lineages and children_taxon:
                for c in node.child_node_iter():
                    stack.append(c)
                continue
                
            # check if node meets mean branch length criterion
            dists_to_tips = []
            for t in node.leaf_iter():
                dists_to_tips.append(self._dist_to_ancestor(t, node))
                
            if np_mean(dists_to_tips) > threshold:
                for c in node.child_node_iter():
                    stack.append(c)
                continue
                                
            # count number of SRA and NCBI taxa below node
            num_sra_taxa = 0
            num_ncbi_taxa = 0
            taxa_labels = set()
            for t in node.leaf_iter():
                if t.taxon.label.startswith('U_'):
                    num_sra_taxa += 1
                else:
                    num_ncbi_taxa += 1
                    
                taxon_entry = taxonomy[t.taxon.label]
                taxon = taxon_entry[rank][3:].replace('Candidatus ', '')
                if taxon:
                    taxa_labels.add(taxon)
                    
            if parent_taxon:
                taxa_labels.add(parent_taxon[3:].replace('Candidatus ', ''))
            elif children_taxon:
                for c in children_taxon:
                    taxa_labels.add(c[3:].replace('Candidatus ', ''))
            
                    
            # name lineage based on its position relative to existing named lineages
            if taxa_labels:
                lineage_name = ', '.join(sorted(taxa_labels))
            else:
                lineage_name = 'Unclassified lineage'
            
            support = None
            taxon_name = None
            if node.label: # preserve support information
                support, _taxon_name, _auxiliary_info = parse_label(node.label)

            new_name_number[lineage_name] += 1

            if support:
                node.label = '%d:%s %d' % (support, lineage_name, new_name_number[lineage_name])
            else:    
                node.label = '%s %d' % (lineage_name, new_name_number[lineage_name])
                                                        
            labeled_nodes.add(node)
                 
            if num_sra_taxa == 0:
                ncbi_only += 1
            if num_ncbi_taxa == 0:
                sra_only += 1
                
        # strip previous labels
        if not keep_labels:
            for node in tree.preorder_internal_node_iter():
                if node in labeled_nodes:
                    continue
                    
                if node.label: # preserve support information
                    support, _taxon_name, _auxiliary_info = parse_label(node.label)
                    node.label = support
                    
        # prune tree to shallowest and deepest taxa in each named lineage
        if prune:
            nodes_to_prune = set()
            for node in labeled_nodes:
                for c in node.child_node_iter():
                    dists = []
                    for t in c.leaf_iter():
                        d = self._dist_to_ancestor(t, node)
                        dists.append((d, t))
                    
                    dists.sort()
                    
                    # select taxa at the 10th and 90th percentiles to
                    # give a good sense of the range of depths
                    perc_10th_index = int(0.1 * len(dists) + 0.5)
                    perc_90th_index = int(0.9 * len(dists) + 0.5)
                    for i, (d, t) in enumerate(dists):
                        if i != perc_10th_index and i != perc_90th_index:
                            nodes_to_prune.add(t.taxon)
                
            self.logger.info('Tree contains %d leaf nodes before pruning.' % sum([1 for _ in tree.leaf_node_iter()]))
            tree.prune_taxa(nodes_to_prune)
            self.logger.info('Tree contains %d leaf nodes after pruning.' % sum([1 for _ in tree.leaf_node_iter()]))
                        
        self.logger.info('Decorated %d internal nodes.' % sum(new_name_number.values()))
        #self.logger.info('NCBI-only %d; SRA-only %d' % (ncbi_only, sra_only))
        
        tree.write_to_path(output_tree, schema='newick', suppress_rooting=True, unquoted_underscores=True)
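
# Hedged usage sketch (not part of the original module): the decoration loop
# above labels an internal node only when the mean branch length from that node
# to its leaf tips is at or below `threshold`. The standalone helper below
# reproduces that criterion with plain dendropy; the demo tree and the 0.3
# cutoff are illustrative assumptions.
import dendropy

def mean_dist_to_tips(node):
    """Mean sum of edge lengths from an internal node down to each of its leaf tips."""
    dists = []
    for leaf in node.leaf_iter():
        d = 0.0
        p = leaf
        while p is not None and p is not node:
            d += p.edge.length or 0.0
            p = p.parent_node
        dists.append(d)
    return sum(dists) / len(dists)

if __name__ == '__main__':
    demo_tree = dendropy.Tree.get(data='((A:0.1,B:0.2):0.05,(C:0.4,D:0.5):0.1);',
                                  schema='newick')
    for nd in demo_tree.preorder_internal_node_iter():
        mean_bl = mean_dist_to_tips(nd)
        print('mean dist to tips = %.3f -> label this node: %s' % (mean_bl, mean_bl <= 0.3))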
Beispiel #52
def stats_for_qlp_well(well, compute_clusters=False, override_thresholds=None):
    """
    Return statistics about a QLWell object read from a QLP file.
    The QLWell object should have a populated `peaks` attribute (reading from QLBs won't work)

    Returns per-channel statistics and, when compute_clusters is set, cluster assignments.
    """
    from pyqlb.nstats.peaks import cluster_1d, channel_amplitudes
    from pyqlb.nstats.well import accepted_peaks, above_min_amplitude_peaks, well_channel_sp_values, well_cluster_peaks
    from pyqlb.nstats.well import well_observed_positives_negatives, well_s2d_values, getClusters
    from pyqlb.nstats.well import high_flier_droplets, low_flier_droplets, singleRain_droplets, doubleRain_droplets, diagonal_scatter
    from numpy import mean as np_mean, std as np_std

    if not override_thresholds:
        override_thresholds = (None, None)

    statistics = well_statistics(well, override_thresholds=override_thresholds)
    accepted = len(accepted_peaks(well))
    num_above_min = len(above_min_amplitude_peaks(well))

    if num_above_min > 0 and accepted > 0:
        if well.sum_amplitude_bins:
            peaksets, boundaries, amps = revb_polydisperse_peaks(well, 0, threshold=override_thresholds[0])
            poly_peaks = sum([len(p) for p in peaksets])
            statistics[0].revb_polydispersity_pct = 100*float(poly_peaks)/num_above_min
        else:
            peaksets, boundaries, width_gates = polydisperse_peaks(well, 0, threshold=override_thresholds[0])
            poly_peaks = sum([len(p) for p in peaksets])
            statistics[0].revb_polydispersity_pct = 100*float(poly_peaks)/num_above_min
    else:
        statistics[0].revb_polydispersity_pct = 0

    s, p_plus, p, p_minus = well_channel_sp_values(well, 0, override_threshold=override_thresholds[0])
    statistics[0].s_value = s
    statistics[0].p_plus = p_plus
    statistics[0].p_plus_drops = int(p_plus*accepted) if p_plus is not None else None
    statistics[0].p = p
    statistics[0].p_drops = int(p*accepted) if p is not None else None
    statistics[0].p_minus = p_minus
    statistics[0].p_minus_drops = int(p_minus*accepted) if p_minus is not None else None

    if num_above_min > 0 and accepted > 0:
        if well.sum_amplitude_bins:
            peaksets, boundaries, amps = revb_polydisperse_peaks(well, 1, threshold=override_thresholds[1])
            poly_peaks = sum([len(p) for p in peaksets])
            statistics[1].revb_polydispersity_pct = 100*float(poly_peaks)/num_above_min
        else:
            peaksets, boundaries, width_gates = polydisperse_peaks(well, 1, threshold=override_thresholds[1])
            poly_peaks = sum([len(p) for p in peaksets])
            statistics[1].revb_polydispersity_pct = 100*float(poly_peaks)/num_above_min
    else:
        statistics[1].revb_polydispersity_pct = 0

    s, p_plus, p, p_minus = well_channel_sp_values(well, 1, override_threshold=override_thresholds[1])
    statistics[1].s_value = s
    statistics[1].p_plus = p_plus
    statistics[1].p_plus_drops = int(p_plus*accepted) if p_plus is not None else None
    statistics[1].p = p
    statistics[1].p_drops = int(p*accepted) if p is not None else None
    statistics[1].p_minus = p_minus
    statistics[1].p_minus_drops = int(p_minus*accepted) if p_minus is not None else None

    ## compute s2d plots
    s2d_vals = well_s2d_values(well, thresholds=override_thresholds)
    statistics[0].s2d_value = s2d_vals[0] if s2d_vals is not None else None
    statistics[1].s2d_value = s2d_vals[1] if s2d_vals is not None else None

    ## compute extra cluster metrics
    clusters = getClusters(well, override_thresholds)
    dscatter = diagonal_scatter(clusters)
    statistics.diagonal_scatter = dscatter[1] if dscatter is not None else None
    statistics.diagonal_scatter_pct = dscatter[2] * 100 if dscatter is not None else None
    for channel in [0, 1]:
        high_fliers = high_flier_droplets(clusters, channel)
        statistics[channel].high_flier_value = high_fliers[1] if high_fliers is not None else None
        statistics[channel].high_flier_pct = high_fliers[2] * 100 if high_fliers is not None else None

        low_fliers = low_flier_droplets(clusters, channel)
        statistics[channel].low_flier_value = low_fliers[1] if low_fliers is not None else None
        statistics[channel].low_flier_pct = low_fliers[2] * 100 if low_fliers is not None else None

        singleRain = singleRain_droplets(clusters, channel)
        statistics[channel].single_rain_value = singleRain[1] if singleRain is not None else None
        statistics[channel].single_rain_pct = singleRain[2] * 100 if singleRain is not None else None

        doubleRain = doubleRain_droplets(clusters, channel)
        statistics[channel].double_rain_value = doubleRain[1] if doubleRain is not None else None
        statistics[channel].double_rain_pct = doubleRain[2] * 100 if doubleRain is not None else None


    if compute_clusters:
        clusters = well_cluster_peaks(well, override_thresholds)
    else:
        clusters = {'positive_peaks': {'positive_peaks': [], 'negative_peaks': []},
                    'negative_peaks': {'positive_peaks': [], 'negative_peaks': []}}
 
    # cheap hack
    statistics.alg_version = "%s.%s/%s.%s" % (well.statistics.peak_alg_major_version,
                                              well.statistics.peak_alg_minor_version,
                                              well.statistics.quant_alg_major_version,
                                              well.statistics.quant_alg_minor_version)
    statistics.ref_copy_num = well.ref_copy_num
    statistics[0].decision_tree = well.channels[0].decision_tree_verbose
    statistics[1].decision_tree = well.channels[1].decision_tree_verbose
    # end cheap hack

    # SNR
    for chan in (0,1):
        if override_thresholds[chan]:
            # TODO add this to pyqlb.nstats.well instead
            pos, neg = cluster_1d(accepted_peaks(well), chan, override_thresholds[chan])
        else:
            pos, neg, unknown = well_observed_positives_negatives(well, chan)

        for attr, coll in (('positive_snr', pos),('negative_snr',neg)):
            if len(coll) > 0:  # use the cluster for this attribute (pos or neg), not just pos
                amps = channel_amplitudes(coll, chan)
                amp_mean = np_mean(amps)
                amp_std = np_std(amps)
                if amp_std > 0:
                    setattr(statistics[chan], attr, amp_mean/amp_std)
                else:
                    setattr(statistics[chan], attr, 10000)
            else:
                setattr(statistics[chan], attr, 0)

    for channel in [0,1]:
        means,stds = total_events_amplitude_vals(well,channel) 
        statistics[channel].total_events_amplitude_mean = means if means is not None else None
        statistics[channel].total_events_amplitude_stdev = stds if stds is not None else None

    return statistics, clusters
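
# Hedged sketch (separate from the pyqlb API): the SNR loop above boils down to
# mean(amplitudes) / std(amplitudes) per cluster, with a large sentinel when the
# spread is zero and 0 when the cluster is empty. The amplitude lists below are
# made-up values for illustration.
from numpy import mean as np_mean, std as np_std

def channel_snr(amplitudes, zero_std_sentinel=10000):
    """Signal-to-noise ratio of a droplet cluster's amplitudes."""
    if len(amplitudes) == 0:
        return 0
    amp_mean = np_mean(amplitudes)
    amp_std = np_std(amplitudes)
    return amp_mean / amp_std if amp_std > 0 else zero_std_sentinel

print(channel_snr([9800, 10050, 9900, 10150]))  # typical positive cluster
print(channel_snr([1500, 1500, 1500]))          # zero spread -> sentinel
print(channel_snr([]))                          # empty cluster -> 0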
    def get_value_for_data_only(self, values):
        """
        Returns the mean, standard deviation and number of values
        """

        return np_mean(values), np_std(values, ddof=1), np.size(values)
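
# Hedged note (toy data; assumes numpy is imported as np at module level): the
# method above reports the sample standard deviation (ddof=1), which divides by
# n - 1 rather than n. The contrast on a small made-up sample:
import numpy as np

values = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]
print(np.mean(values))            # 5.0
print(np.std(values))             # 2.0   (population, ddof=0)
print(np.std(values, ddof=1))     # ~2.14 (sample, as returned above)
print(np.size(values))            # 8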
Beispiel #54
    def _distribution_plot(self, rel_dists, taxa_for_dist_inference, distribution_table, plot_file):
        """Create plot showing the distribution of taxa at each taxonomic rank.

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        distribution_table : str
            Desired name of output table with distribution information.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)
        
        
        # create normal distributions
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference]
            if len(v) < 2:
                continue
                
            u = np_mean(v)
            rv = norm(loc=u, scale=np_std(v))
            x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
            nd = rv.pdf(x)
            # ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2)
            # ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2)

        # create percentile and classification boundary lines
        percentiles = {}
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference]
            if len(v) == 0:
                continue
                
            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

            for b in [-0.2, -0.1, 0.1, 0.2]:
                boundary = p50 + b
                if boundary < 1.0 and boundary > 0.0:
                    if abs(b) == 0.1:
                        c = (1.0, 0.65, 0.0)  # orange
                    else:
                        c = (1.0, 0.0, 0.0)
                    ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

            percentiles[i] = [p10, p50, p90]

    
        # create scatter plot and results table
        fout = open(distribution_table, 'w')
        fout.write('Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n')
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(sorted(rel_dists.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))
            
            mono = []
            poly = []
            no_inference = []
            for clade_label, dist in rel_dists[rank].iteritems():
                x.append(dist)
                y.append(i)
                labels.append(clade_label)

                if is_integer(clade_label.split('^')[-1]):
                    # taxa with a numerical suffix after a caret indicate 
                    # polyphyletic groups when decorated with tax2tree
                    c.append((1.0, 0.0, 0.0))
                    poly.append(dist)
                elif clade_label not in taxa_for_dist_inference:
                    c.append((0.3, 0.3, 0.3))
                    no_inference.append(dist)
                else:
                    c.append((0.0, 0.0, 1.0))
                    mono.append(dist)
            
                # report results
                v = [clade_label, dist]
                if i in percentiles:
                    p10, p50, p90 = percentiles[i]
                    percentile_outlier = not (dist >= p10 and dist <= p90)
                    v += percentiles[i] + [str(percentile_outlier)]
                else:
                    percentile_outlier = 'Insufficient data to calculate percentiles'
                    v += [-1,-1,-1] + [str(percentile_outlier)]
                
                fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v))
        
            # histogram for each rank
            mono = np_array(mono)
            no_inference = np_array(no_inference)
            poly = np_array(poly)
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)

            w = float(len(mono)) / (len(mono) + len(poly) + len(no_inference))
            n = 0
            if len(mono) > 0:
                mono_max_count = max(np_histogram(mono, bins=bins)[0])
                mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

                n, b, p = ax.hist(mono, bins=bins,
                          color=(0.0, 0.0, 1.0),
                          alpha=0.25,
                          weights=0.9 * w * mono_weights,
                          bottom=i,
                          lw=0,
                          zorder=0)
                      
            if len(no_inference) > 0:
                no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
                no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)

                ax.hist(no_inference, bins=bins,
                          color=(0.3, 0.3, 0.3),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * no_inference_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)

            if len(poly) > 0:
                poly_max_count = max(np_histogram(poly, bins=bins)[0])
                poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

                ax.hist(poly, bins=bins,
                          color=(1.0, 0.0, 0.0),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * poly_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)
                          
        fout.close()

    
        # overlay scatter plot elements
        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(xrange(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
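
# Hedged sketch (values are invented): the table written above flags a taxon as
# a percentile outlier when its relative divergence falls outside the [P10, P90]
# interval computed for its rank. The same check in isolation:
from numpy import percentile as np_percentile

rel_divergences = [0.42, 0.45, 0.47, 0.50, 0.52, 0.55, 0.61, 0.78]
p10, p50, p90 = np_percentile(rel_divergences, [10, 50, 90])
for dist in rel_divergences:
    percentile_outlier = not (p10 <= dist <= p90)
    print('%.2f\tP10=%.2f\tmedian=%.2f\tP90=%.2f\toutlier=%s'
          % (dist, p10, p50, p90, percentile_outlier))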
 def get_value_for_data_only(self, values):
     """
     Return the mean of the values.
     """
     return np_mean(values)
Beispiel #56
    def loadData(self,
                 timer,
                 condition,                 # condition as set by another function
                 bids=[],                   # if this is set then only load those contigs with these bin ids
                 verbose=True,              # many to some output messages
                 silent=False,              # some to no output messages
                 loadCovProfiles=True,
                 loadKmerPCs=True,
                 loadKmerVarPC=True,
                 loadRawKmers=False,
                 makeColors=True,
                 loadContigNames=True,
                 loadContigLengths=True,
                 loadContigGCs=True,
                 loadBins=False,
                 loadLinks=False):
        """Load pre-parsed data"""

        timer.getTimeStamp()
        if(silent):
            verbose=False
        if verbose:
            print "Loading data from:", self.dbFileName

        try:
            self.numStoits = self.getNumStoits()
            self.condition = condition
            self.indices = self.dataManager.getConditionalIndices(self.dbFileName,
                                                                  condition=condition,
                                                                  silent=silent)
            if(verbose):
                print "    Loaded indices with condition:", condition
            self.numContigs = len(self.indices)

            if self.numContigs == 0:
                print "    ERROR: No contigs loaded using condition:", condition
                return

            if(not silent):
                print "    Working with: %d contigs" % self.numContigs

            if(loadCovProfiles):
                if(verbose):
                    print "    Loading coverage profiles"
                self.covProfiles = self.dataManager.getCoverageProfiles(self.dbFileName, indices=self.indices)
                self.normCoverages = self.dataManager.getNormalisedCoverageProfiles(self.dbFileName, indices=self.indices)

                # work out average coverages
                self.averageCoverages = np_array([sum(i)/self.numStoits for i in self.covProfiles])

            if loadRawKmers:
                if(verbose):
                    print "    Loading RAW kmer sigs"
                self.kmerSigs = self.dataManager.getKmerSigs(self.dbFileName, indices=self.indices)

            if(loadKmerPCs):
                self.kmerPCs = self.dataManager.getKmerPCAs(self.dbFileName, indices=self.indices)

                if(verbose):
                    print "    Loading PCA kmer sigs (" + str(len(self.kmerPCs[0])) + " dimensional space)"

                self.kmerNormPC1 = np_copy(self.kmerPCs[:,0])
                self.kmerNormPC1 -= np_min(self.kmerNormPC1)
                self.kmerNormPC1 /= np_max(self.kmerNormPC1)

            if(loadKmerVarPC):
                self.kmerVarPC = self.dataManager.getKmerVarPC(self.dbFileName, indices=self.indices)

                if(verbose):
                    print "    Loading PCA kmer variance (total variance: %.2f" % np_sum(self.kmerVarPC) + ")"

            if(loadContigNames):
                if(verbose):
                    print "    Loading contig names"
                self.contigNames = self.dataManager.getContigNames(self.dbFileName, indices=self.indices)

            if(loadContigLengths):
                self.contigLengths = self.dataManager.getContigLengths(self.dbFileName, indices=self.indices)
                if(verbose):
                    print "    Loading contig lengths (Total: %d BP)" % ( sum(self.contigLengths) )

            if(loadContigGCs):
                self.contigGCs = self.dataManager.getContigGCs(self.dbFileName, indices=self.indices)
                if(verbose):
                    print "    Loading contig GC ratios (Average GC: %0.3f)" % ( np_mean(self.contigGCs) )

            if(makeColors):
                if(verbose):
                    print "    Creating color map"

                # use HSV to RGB to generate colors
                S = 1       # SAT and VAL remain fixed at 1. Reduce to make
                V = 1       # Pastels if that's your preference...
                self.colorMapGC = self.createColorMapHSV()

            if(loadBins):
                if(verbose):
                    print "    Loading bin assignments"

                self.binIds = self.dataManager.getBins(self.dbFileName, indices=self.indices)

                if len(bids) != 0: # need to make sure we're not restricted in terms of bins
                    bin_stats = self.getBinStats()
                    for bid in bids:
                        try:
                            self.validBinIds[bid] = bin_stats[bid][0]
                            self.isLikelyChimeric[bid]= bin_stats[bid][1]
                        except KeyError:
                            self.validBinIds[bid] = 0
                            self.isLikelyChimeric[bid]= False

                else:
                    bin_stats = self.getBinStats()
                    for bid in bin_stats:
                        self.validBinIds[bid] = bin_stats[bid][0]
                        self.isLikelyChimeric[bid] = bin_stats[bid][1]

                # fix the binned indices
                self.binnedRowIndices = {}
                for i in range(len(self.indices)):
                    if(self.binIds[i] != 0):
                        self.binnedRowIndices[i] = True
            else:
                # we need zeros as bin indices then...
                self.binIds = np_zeros(len(self.indices))

            if(loadLinks):
                self.loadLinks()

            self.stoitColNames = self.getStoitColNames()

        except:
            print "Error loading DB:", self.dbFileName, exc_info()[0]
            raise
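
# Hedged sketch (synthetic matrix; standalone numpy only): loadData rescales the
# first kmer PCA component to [0, 1] by subtracting its minimum and dividing by
# the resulting maximum. The same min-max normalisation on made-up data:
from numpy import array as np_array, copy as np_copy, min as np_min, max as np_max

kmer_pcs = np_array([[-2.3, 0.1], [0.4, -0.7], [1.9, 0.3], [5.0, 0.0]])
norm_pc1 = np_copy(kmer_pcs[:, 0])
norm_pc1 -= np_min(norm_pc1)
norm_pc1 /= np_max(norm_pc1)
print(norm_pc1)  # first component now spans 0.0 to 1.0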
    def _distribution_plot(self, rel_dists, rel_dist_thresholds, taxa_for_dist_inference, distribution_table, plot_file):
        """Create plot showing the distribution of taxa at each taxonomic rank.

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        rel_dist_thresholds: list
            Relative distances cutoffs for defining ranks.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        distribution_table : str
            Desired name of output table with distribution information.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        # create normal distributions
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference]
            u = np_mean(v)
            rv = norm(loc=u, scale=np_std(v))
            x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
            nd = rv.pdf(x)
            ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2)
            ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2)

        # create percentile lines
        percentiles = {}
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference]
            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.5), 'r-', zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), 'r-', zorder=2)
            ax.plot((p90, p90), (i, i + 0.5), 'r-', zorder=2)

            percentiles[i] = [p10, p50, p90]

        # create scatter plot and results table
        fout = open(distribution_table, 'w')
        fout.write('Taxa\tRelative Distance\tRank cutoff\tRank outlier\tP10\tMedian\tP90\tPercentile outlier\n')
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        rel_dist_thresholds += [1.0]  # append boundary for species
        for i, rank in enumerate(sorted(rel_dists.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))

            for clade_label, dist in rel_dists[rank].iteritems():
                x.append(dist)
                y.append(i)
                labels.append(clade_label)

                if clade_label in taxa_for_dist_inference:
                    c.append((0.0, 0.0, 0.5))
                else:
                    c.append((0.5, 0.5, 0.5))

                p10, p50, p90 = percentiles[i]
                percentile_outlier = not (dist >= p10 and dist <= p90)

                if i == 0:
                    rank_cutoff = rel_dist_thresholds[i]
                    rank_outlier = dist > rank_cutoff
                else:
                    rank_cutoff = rel_dist_thresholds[i]
                    upper_rank_cutoff = rel_dist_thresholds[i - 1]
                    rank_outlier = not (dist >= upper_rank_cutoff and dist <= rank_cutoff)

                v = [clade_label, dist, rank_cutoff, str(rank_outlier)]
                v += percentiles[i] + [str(percentile_outlier)]
                fout.write('%s\t%.2f\t%.2f\t%s\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v))
        fout.close()

        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(xrange(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # plot relative divergence threshold lines
        y_min, y_max = ax.get_ylim()
        for threshold in rel_dist_thresholds[0:-1]:  # don't draw species boundary
            ax.plot((threshold, threshold), (y_min, y_max), color='r', ls='--')
            ax.text(threshold + 0.001, y_max, '%.3f' % threshold, horizontalalignment='center')

        # make plot interactive
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=96)
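
# Hedged sketch (cutoff values are invented for illustration): the rank outlier
# test above asks whether a taxon's relative divergence lies between the cutoff
# of the parent rank and the cutoff of its own rank (or simply below the first
# cutoff for the highest rank).
rel_dist_thresholds = [0.25, 0.45, 0.65, 0.85, 1.0]  # last entry is the species boundary

def is_rank_outlier(dist, rank_index, thresholds):
    if rank_index == 0:
        return dist > thresholds[0]
    return not (thresholds[rank_index - 1] <= dist <= thresholds[rank_index])

print(is_rank_outlier(0.30, 0, rel_dist_thresholds))  # True: beyond the first cutoff
print(is_rank_outlier(0.30, 1, rel_dist_thresholds))  # False: within [0.25, 0.45]
print(is_rank_outlier(0.95, 2, rel_dist_thresholds))  # True: above the rank's cutoff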