def calc_windowed_seg_sites(self, chrom=0, L=1e3, filt_rec=True, mask=None):
    """Calculate windowed estimates of segregating sites.

    Arguments:
        * chrom: identifier for the chromosome
        * L: length of independent locus
        * filt_rec: filter recombination
        * mask: bed file for the underlying mask

    Stores a (4, nbins) array in self.chrom_total_dict[chrom] with rows:
    windowed variant counts, right bin edges, recombination-map midpoints,
    and mask-derived scaling weights.
    """
    assert self.chrom_pos_dict is not None
    phys_pos = self.chrom_physpos_dict[chrom]
    rec_pos = self.chrom_pos_dict[chrom]
    weights = self.chrom_weight_dict[chrom]
    if filt_rec:
        # Keep only sites whose recombination position differs from the
        # following site's (zero-step entries are dropped).
        keep = np.flatnonzero(np.abs(rec_pos[:-1] - rec_pos[1:]) != 0)
        phys_pos = phys_pos[keep]
        rec_pos = rec_pos[keep]
        weights = weights[keep]
    if mask is not None:
        # Flag sites overlapping the mask as NaN so they are excluded below.
        phys_pos = phys_pos.astype(np.float64)
        bed_mask = pyranges.read_bed(mask)
        site_ranges = PyRanges(
            chromosomes=chrom, starts=phys_pos, ends=(phys_pos + 1)
        )
        overlap_frac = np.array(
            site_ranges.coverage(bed_mask).FractionOverlaps.astype(np.float32)
        )
        phys_pos[np.where(overlap_frac > 0.0)[0]] = np.nan
    # 1. Histogram the (weighted) variants into windows of length L.
    edges = np.arange(np.nanmin(phys_pos), np.nanmax(phys_pos), L)
    unmasked = ~np.isnan(phys_pos)
    windowed_vars, bin_edges = np.histogram(
        phys_pos[unmasked], bins=edges, weights=weights[unmasked]
    )
    bin_edges = bin_edges.astype(np.uint32)
    # Interpolate recombination-map positions at the window midpoints.
    phys_to_rec = interpolate.interp1d(phys_pos, rec_pos)
    window_mid = bin_edges[:-1] + (bin_edges[1:] - bin_edges[:-1]) / 2
    rec_midpts = phys_to_rec(window_mid)
    # Per-window weights from the mask (identity when no mask is supplied).
    mask_weights = np.ones(rec_midpts.size)
    if mask is not None:
        # Mask must be a bedfile.
        window_ranges = PyRanges(
            chromosomes=chrom, starts=bin_edges[:-1], ends=bin_edges[1:]
        )
        bed_mask = pyranges.read_bed(mask)
        frac_masked = np.array(
            window_ranges.coverage(bed_mask).FractionOverlaps.astype(np.float32)
        )
        # Scale each window up by the fraction that is masked out; a fully
        # masked window (fraction 1.0) becomes NaN rather than infinite.
        mask_weights = 1.0 / (1.0 - frac_masked)
        mask_weights[np.isinf(mask_weights)] = np.nan
    # Stack everything so downstream consumers get one array per chromosome.
    tot_data = np.vstack([windowed_vars, bin_edges[1:], rec_midpts, mask_weights])
    self.chrom_total_dict[chrom] = tot_data
def monte_carlo_corr_SA_SB_v2(
    self, L=1e3, dist=100, nreps=1000, chrom=0, seed=42, filt_rec=True, mask=None
):
    """Estimate the correlation using alternative Monte-Carlo Sampling.

    Key: this allows us to test much shorter length scales.

    Builds adjacent pairs of windows of length L separated by `dist` bp,
    counts segregating sites per window, and resamples `nreps` window pairs
    (with replacement) together with the recombination distance across the
    gap between the paired windows. Results are appended into
    self.rec_dist / self.s1 / self.s2 keyed by chromosome.

    Arguments:
        * L: window length in bp (must be > 0)
        * dist: gap between consecutive windows in bp (must be > 0)
        * nreps: number of Monte-Carlo resampling draws
        * chrom: identifier for the chromosome
        * seed: random seed (must be > 0)
        * filt_rec: drop sites with zero recombination step to the next site
        * mask: bed file for the underlying mask
    """
    assert self.chrom_physpos_dict is not None
    assert self.chrom_pos_dict is not None
    assert L > 0
    assert dist > 0
    assert seed > 0
    # Seed the global NumPy RNG so the resampling below is reproducible.
    np.random.seed(seed)
    phys_pos = self.chrom_physpos_dict[chrom]
    rec_pos = self.chrom_pos_dict[chrom]
    weights = self.chrom_weight_dict[chrom]
    if filt_rec:
        # Keep sites whose recombination position differs from the next
        # site's (zero-step entries are dropped).
        diff = np.abs(rec_pos[:-1] - rec_pos[1:])
        idx = np.where(diff != 0)[0]
        phys_pos = phys_pos[idx]
        rec_pos = rec_pos[idx]
        weights = weights[idx]
    if mask is not None:
        # Flag sites overlapping the mask as NaN so they are excluded below.
        phys_pos = phys_pos.astype(np.float64)
        df_mask = pyranges.read_bed(mask)
        df_pos = PyRanges(chromosomes=chrom, starts=phys_pos, ends=(phys_pos + 1))
        cov_sites = df_pos.coverage(df_mask)
        sites_idx = np.array(cov_sites.FractionOverlaps.astype(np.float32))
        idx = np.where(sites_idx > 0.0)[0]
        phys_pos[idx] = np.nan
    # 1. Setup bins separated by some distance: windows are
    # [startp, endp] pairs of length L, each separated by `dist` bp.
    startp = np.nanmin(phys_pos)
    endp = startp + L
    windowed_vars = []
    bins = []
    while endp < np.nanmax(phys_pos):
        bins.append((startp, endp))
        # Count the non-NaN variants falling inside [startp, endp].
        start = np.searchsorted(phys_pos[~np.isnan(phys_pos)], startp, "left")
        end = np.searchsorted(phys_pos[~np.isnan(phys_pos)], endp, "right")
        # Append this to actually weight the variants
        windowed_vars.append(end - start)
        startp += L + dist
        endp += L + dist
    windowed_vars = np.array(windowed_vars)
    # bin_edges is the interleaved [s0, e0, s1, e1, ...] sequence.
    bin_edges = np.array(bins).ravel()
    assert (bin_edges.size / 2) == windowed_vars.size
    # Interpolate the midpoints of the recombination bins:
    # rec_dist[i] = genetic distance from the end of window i to the start
    # of window i+1 (i.e. across the gap between consecutive windows).
    f = interpolate.interp1d(phys_pos, rec_pos)
    rec_dist = f(bin_edges[2:-1:2]) - f(bin_edges[1:-2:2])
    # Drop the final window so counts align with the inter-window distances.
    windowed_vars = windowed_vars[:-1]
    # Calculate the weightings from the mask as needed ...
    mask_weights = np.ones(windowed_vars.size)
    if mask is not None:
        # Mask must be a bedfile.
        # NOTE(review): these "windows" are built from consecutive
        # interleaved edges, so they include the gap intervals too, and
        # mask_weights here has size bin_edges.size - 1, which does not
        # match windowed_vars.size — verify the mask branch before use.
        df_windows = PyRanges(
            chromosomes=chrom, starts=bin_edges[:-1], ends=bin_edges[1:]
        )
        df_mask = pyranges.read_bed(mask)
        cov = df_windows.coverage(df_mask)
        mask_weights = np.array(cov.FractionOverlaps.astype(np.float32))
        # Set the mask weights to scale up the fraction that may be missing!
        mask_weights = 1.0 / (1.0 - mask_weights)
        mask_weights[np.isinf(mask_weights)] = np.nan
    # Scale the per-window counts by the mask weights.
    windowed_het_weighted = mask_weights * windowed_vars
    # Pair up windows: s1s[i] is window 2i, s2s[i] is window 2i+1.
    s1s = windowed_het_weighted[:-2:2]
    s2s = windowed_het_weighted[1:-1:2]
    assert s1s.size == s2s.size
    # Perform the Monte-Carlo resampling here: draw window pairs with
    # replacement; rec_dist[2*idx] is the gap distance inside pair idx.
    idx = np.random.randint(s1s.size, size=nreps)
    s1s_samp = s1s[idx]
    s2s_samp = s2s[idx]
    rec_dist_samp = rec_dist[2 * idx]
    # Lazily initialize the accumulators (assumes self.rec_dist exists,
    # possibly as None, from the constructor — TODO confirm).
    if self.rec_dist is None:
        self.rec_dist = {}
        self.s1 = {}
        self.s2 = {}
    if chrom in self.rec_dist:
        # Append to any samples accumulated for this chromosome earlier.
        tmp_rec_dist = np.append(self.rec_dist[chrom], rec_dist_samp)
        tmp_s1 = np.append(self.s1[chrom], s1s_samp)
        tmp_s2 = np.append(self.s2[chrom], s2s_samp)
        self.rec_dist[chrom] = tmp_rec_dist
        self.s1[chrom] = tmp_s1
        self.s2[chrom] = tmp_s2
    else:
        self.rec_dist[chrom] = rec_dist_samp
        self.s1[chrom] = s1s_samp
        self.s2[chrom] = s2s_samp
def compute_peaks_and_zscores(cvg, center, left, right, chip, background_sum,
                              ratios, ratio, args):
    """Score candidate peaks and filter them by local enrichment.

    Delegates peak finding to ``_compute_peaks_and_zscores``, then for each
    peak type (1.., from ``enumerate(all_peaks, 1)``):
      * converts each peak's maximum z-score into a -log10 p-value (NLP),
      * computes enrichment of ChIP signal over background at the peak
        midpoint and drops peaks below the ``min_enrichment`` percentile
        threshold (thresholds are fixed from peak type 1 and reused for the
        later types),
      * annotates surviving peaks with coverage at the summit (CVG) and the
        flanking windows (SURL / SURR).

    Arguments:
        cvg: ChIP coverage, keyed like the peak ranges (presumably by
            chromosome/strand — TODO confirm against caller).
        center, left, right: center/flank coverage; ``center`` is only
            forwarded to ``_compute_peaks_and_zscores``.
        chip, background_sum: ChIP totals and summed background coverage.
        ratios, ratio: normalization factors; ``ratio`` is keyed by strand.
        args: dict with at least "min_enrichment" (fraction in [0, 1]).

    Returns:
        dict mapping peak type (int) -> annotated, filtered peaks.
    """
    print("peaks and zscores")
    all_peaks, zs = _compute_peaks_and_zscores(cvg, center, left, right, chip,
                                               background_sum, ratios, ratio,
                                               args)
    print("peaks and zscores done")

    min_er = args["min_enrichment"]
    peaks_with_info = {}
    for peak_type, peaks in enumerate(all_peaks, 1):
        # Max z-score per peak -> -log10 p-value, rounded to 3 decimals.
        max_zs = {}
        for k, v in zs[peak_type - 1].items():
            max_zs[k] = np.array([max(v2[1]) for v2 in v])
        result = {k: -(pnorm(v) / np.log(10)) for k, v in max_zs.items()}
        peaks.NLP = np.around(
            np.concatenate([result[k] for k in natsorted(result)]), 3)
        # Summit = midpoint of the peak interval. np.long was removed in
        # NumPy 1.24; np.int64 keeps the same 64-bit width.
        peaks.Location = np.array(np.ceil((peaks.Start + peaks.End) / 2),
                                  dtype=np.int64)
        peaks.Type = peak_type

        # One-bp ranges at the summits (keyword made consistent with the
        # other PyRanges constructors in this file: chromosomes=, not
        # the legacy seqnames=).
        peaks_loc = PyRanges(chromosomes=peaks.Chromosome,
                             starts=peaks.Location,
                             ends=peaks.Location + 1,
                             strands=peaks.Strand)
        loc_cvg = peaks_loc.coverage()
        chip_cvg = loc_cvg * cvg
        bg_cvg = loc_cvg * background_sum

        # Strand-specific enrichment of ChIP signal over background, with a
        # +1 pseudocount on both numerator and denominator.
        peak_enrich_cvg_f = 1 + (ratio["+"] * chip_cvg["+"])
        peak_enrich_cvg_r = 1 + (ratio["-"] * chip_cvg["-"])
        # list() each side before concatenating: dict-view .items() objects
        # cannot be added with `+` on Python 3.
        peak_enrich_cvg = PyRles({
            k: v
            for k, v in (list(peak_enrich_cvg_r.items()) +
                         list(peak_enrich_cvg_f.items()))
        })
        peak_enrich_ref = 1 + (bg_cvg)
        peak_enrich = peak_enrich_cvg / peak_enrich_ref

        vals_f = np.concatenate(
            [peak_enrich[k].values for k in peak_enrich["+"].keys()])
        vals_r = np.concatenate(
            [peak_enrich[k].values for k in peak_enrich["-"].keys()])
        vals_f = vals_f[np.isfinite(vals_f)]
        vals_r = vals_r[np.isfinite(vals_r)]
        vals_f = vals_f[vals_f > 1]
        vals_r = vals_r[vals_r > 1]

        # Enrichment thresholds are computed once, from peak type 1, and
        # reused for the later peak types.
        if peak_type == 1:
            min_er_f = np.percentile(vals_f, min_er * 100)
            min_er_r = np.percentile(vals_r, min_er * 100)
        vals_f = vals_f > min_er_f
        vals_r = vals_r > min_er_r

        peaks["+"].Enrichment = vals_f
        peaks["-"].Enrichment = vals_r
        peaks_loc["+"].Enrichment = vals_f
        peaks_loc["-"].Enrichment = vals_r
        # Keep only sufficiently enriched peaks.
        peaks = peaks.apply(
            lambda df, _: df[df.Enrichment].drop("Enrichment", axis=1))
        peaks_loc = peaks_loc.apply(
            lambda df, _: df[df.Enrichment].drop("Enrichment", axis=1))
        peaks_loc.Start += 1
        peaks_loc.End += 1

        # Coverage at the summit and in the flanking windows for each
        # surviving peak.
        chip_cvg = np.array(np.concatenate([
            cvg[k][peaks[k].Location] for k in cvg.keys()
            if not peaks[k].empty()
        ]), dtype=np.int64)
        left_cvg = np.array(np.concatenate([
            left[k][peaks[k].Location] for k in left.keys()
            if not peaks[k].empty()
        ]), dtype=np.int64)
        right_cvg = np.array(np.concatenate([
            right[k][peaks[k].Location] for k in right.keys()
            if not peaks[k].empty()
        ]), dtype=np.int64)
        peaks.CVG = chip_cvg
        peaks.SURL = left_cvg
        peaks.SURR = right_cvg
        peaks.drop_empty()
        peaks_with_info[peak_type] = peaks
    return peaks_with_info