def calc_windowed_seg_sites(self, chrom=0, L=1e3, filt_rec=True, mask=None):
    """Calculate windowed estimates of segregating sites.

    Arguments:
        * chrom: identifier for the chromosome
        * L: length of independent locus
        * filt_rec: filter recombination
        * mask: bed file for the underlying mask

    Stores a (4, nbins) array in self.chrom_total_dict[chrom] with rows:
    windowed variant counts, right bin edges, recombination-map midpoints,
    and mask-derived scaling weights.
    """
    assert self.chrom_pos_dict is not None
    phys_pos = self.chrom_physpos_dict[chrom]
    rec_pos = self.chrom_pos_dict[chrom]
    weights = self.chrom_weight_dict[chrom]
    if filt_rec:
        # Keep only sites whose recombination position differs from the
        # following site's (zero-step entries are dropped).
        keep = np.flatnonzero(np.abs(rec_pos[:-1] - rec_pos[1:]) != 0)
        phys_pos = phys_pos[keep]
        rec_pos = rec_pos[keep]
        weights = weights[keep]
    if mask is not None:
        # Flag sites overlapping the mask as NaN so they are excluded below.
        phys_pos = phys_pos.astype(np.float64)
        bed_mask = pyranges.read_bed(mask)
        site_ranges = PyRanges(
            chromosomes=chrom, starts=phys_pos, ends=(phys_pos + 1)
        )
        overlap_frac = np.array(
            site_ranges.coverage(bed_mask).FractionOverlaps.astype(np.float32)
        )
        phys_pos[np.where(overlap_frac > 0.0)[0]] = np.nan
    # 1. Histogram the (weighted) variants into windows of length L.
    edges = np.arange(np.nanmin(phys_pos), np.nanmax(phys_pos), L)
    unmasked = ~np.isnan(phys_pos)
    windowed_vars, bin_edges = np.histogram(
        phys_pos[unmasked], bins=edges, weights=weights[unmasked]
    )
    bin_edges = bin_edges.astype(np.uint32)
    # Interpolate recombination-map positions at the window midpoints.
    phys_to_rec = interpolate.interp1d(phys_pos, rec_pos)
    window_mid = bin_edges[:-1] + (bin_edges[1:] - bin_edges[:-1]) / 2
    rec_midpts = phys_to_rec(window_mid)
    # Per-window weights from the mask (identity when no mask is supplied).
    mask_weights = np.ones(rec_midpts.size)
    if mask is not None:
        # Mask must be a bedfile.
        window_ranges = PyRanges(
            chromosomes=chrom, starts=bin_edges[:-1], ends=bin_edges[1:]
        )
        bed_mask = pyranges.read_bed(mask)
        frac_masked = np.array(
            window_ranges.coverage(bed_mask).FractionOverlaps.astype(np.float32)
        )
        # Scale each window up by the fraction that is masked out; a fully
        # masked window (fraction 1.0) becomes NaN rather than infinite.
        mask_weights = 1.0 / (1.0 - frac_masked)
        mask_weights[np.isinf(mask_weights)] = np.nan
    # Stack everything so downstream consumers get one array per chromosome.
    tot_data = np.vstack([windowed_vars, bin_edges[1:], rec_midpts, mask_weights])
    self.chrom_total_dict[chrom] = tot_data
def monte_carlo_corr_SA_SB_v2(
    self, L=1e3, dist=100, nreps=1000, chrom=0, seed=42, filt_rec=True, mask=None
):
    """Estimate the correlation using alternative Monte-Carlo Sampling.

    Key: this allows us to test much shorter length scales.

    Builds adjacent pairs of windows of length L separated by `dist` bp,
    counts segregating sites per window, and resamples `nreps` window pairs
    (with replacement) together with the recombination distance across the
    gap between the paired windows. Results are appended into
    self.rec_dist / self.s1 / self.s2 keyed by chromosome.

    Arguments:
        * L: window length in bp (must be > 0)
        * dist: gap between consecutive windows in bp (must be > 0)
        * nreps: number of Monte-Carlo resampling draws
        * chrom: identifier for the chromosome
        * seed: random seed (must be > 0)
        * filt_rec: drop sites with zero recombination step to the next site
        * mask: bed file for the underlying mask
    """
    assert self.chrom_physpos_dict is not None
    assert self.chrom_pos_dict is not None
    assert L > 0
    assert dist > 0
    assert seed > 0
    # Seed the global NumPy RNG so the resampling below is reproducible.
    np.random.seed(seed)
    phys_pos = self.chrom_physpos_dict[chrom]
    rec_pos = self.chrom_pos_dict[chrom]
    weights = self.chrom_weight_dict[chrom]
    if filt_rec:
        # Keep sites whose recombination position differs from the next
        # site's (zero-step entries are dropped).
        diff = np.abs(rec_pos[:-1] - rec_pos[1:])
        idx = np.where(diff != 0)[0]
        phys_pos = phys_pos[idx]
        rec_pos = rec_pos[idx]
        weights = weights[idx]
    if mask is not None:
        # Flag sites overlapping the mask as NaN so they are excluded below.
        phys_pos = phys_pos.astype(np.float64)
        df_mask = pyranges.read_bed(mask)
        df_pos = PyRanges(chromosomes=chrom, starts=phys_pos, ends=(phys_pos + 1))
        cov_sites = df_pos.coverage(df_mask)
        sites_idx = np.array(cov_sites.FractionOverlaps.astype(np.float32))
        idx = np.where(sites_idx > 0.0)[0]
        phys_pos[idx] = np.nan
    # 1. Setup bins separated by some distance: windows are
    # [startp, endp] pairs of length L, each separated by `dist` bp.
    startp = np.nanmin(phys_pos)
    endp = startp + L
    windowed_vars = []
    bins = []
    while endp < np.nanmax(phys_pos):
        bins.append((startp, endp))
        # Count the non-NaN variants falling inside [startp, endp].
        start = np.searchsorted(phys_pos[~np.isnan(phys_pos)], startp, "left")
        end = np.searchsorted(phys_pos[~np.isnan(phys_pos)], endp, "right")
        # Append this to actually weight the variants
        windowed_vars.append(end - start)
        startp += L + dist
        endp += L + dist
    windowed_vars = np.array(windowed_vars)
    # bin_edges is the interleaved [s0, e0, s1, e1, ...] sequence.
    bin_edges = np.array(bins).ravel()
    assert (bin_edges.size / 2) == windowed_vars.size
    # Interpolate the midpoints of the recombination bins:
    # rec_dist[i] = genetic distance from the end of window i to the start
    # of window i+1 (i.e. across the gap between consecutive windows).
    f = interpolate.interp1d(phys_pos, rec_pos)
    rec_dist = f(bin_edges[2:-1:2]) - f(bin_edges[1:-2:2])
    # Drop the final window so counts align with the inter-window distances.
    windowed_vars = windowed_vars[:-1]
    # Calculate the weightings from the mask as needed ...
    mask_weights = np.ones(windowed_vars.size)
    if mask is not None:
        # Mask must be a bedfile.
        # NOTE(review): these "windows" are built from consecutive
        # interleaved edges, so they include the gap intervals too, and
        # mask_weights here has size bin_edges.size - 1, which does not
        # match windowed_vars.size — verify the mask branch before use.
        df_windows = PyRanges(
            chromosomes=chrom, starts=bin_edges[:-1], ends=bin_edges[1:]
        )
        df_mask = pyranges.read_bed(mask)
        cov = df_windows.coverage(df_mask)
        mask_weights = np.array(cov.FractionOverlaps.astype(np.float32))
        # Set the mask weights to scale up the fraction that may be missing!
        mask_weights = 1.0 / (1.0 - mask_weights)
        mask_weights[np.isinf(mask_weights)] = np.nan
    # Scale the per-window counts by the mask weights.
    windowed_het_weighted = mask_weights * windowed_vars
    # Pair up windows: s1s[i] is window 2i, s2s[i] is window 2i+1.
    s1s = windowed_het_weighted[:-2:2]
    s2s = windowed_het_weighted[1:-1:2]
    assert s1s.size == s2s.size
    # Perform the Monte-Carlo resampling here: draw window pairs with
    # replacement; rec_dist[2*idx] is the gap distance inside pair idx.
    idx = np.random.randint(s1s.size, size=nreps)
    s1s_samp = s1s[idx]
    s2s_samp = s2s[idx]
    rec_dist_samp = rec_dist[2 * idx]
    # Lazily initialize the accumulators (assumes self.rec_dist exists,
    # possibly as None, from the constructor — TODO confirm).
    if self.rec_dist is None:
        self.rec_dist = {}
        self.s1 = {}
        self.s2 = {}
    if chrom in self.rec_dist:
        # Append to any samples accumulated for this chromosome earlier.
        tmp_rec_dist = np.append(self.rec_dist[chrom], rec_dist_samp)
        tmp_s1 = np.append(self.s1[chrom], s1s_samp)
        tmp_s2 = np.append(self.s2[chrom], s2s_samp)
        self.rec_dist[chrom] = tmp_rec_dist
        self.s1[chrom] = tmp_s1
        self.s2[chrom] = tmp_s2
    else:
        self.rec_dist[chrom] = rec_dist_samp
        self.s1[chrom] = s1s_samp
        self.s2[chrom] = s2s_samp
def compute_peaks_and_zscores(cvg, center, left, right, chip, background_sum,
                              ratios, ratio, args):
    """Score candidate peaks and filter them by local enrichment.

    Delegates peak finding to ``_compute_peaks_and_zscores``, then for each
    peak type (1.., from ``enumerate(all_peaks, 1)``):
      * converts each peak's maximum z-score into a -log10 p-value (NLP),
      * computes enrichment of ChIP signal over background at the peak
        midpoint and drops peaks below the ``min_enrichment`` percentile
        threshold (thresholds are fixed from peak type 1 and reused for the
        later types),
      * annotates surviving peaks with coverage at the summit (CVG) and the
        flanking windows (SURL / SURR).

    Arguments:
        cvg: ChIP coverage, keyed like the peak ranges (presumably by
            chromosome/strand — TODO confirm against caller).
        center, left, right: center/flank coverage; ``center`` is only
            forwarded to ``_compute_peaks_and_zscores``.
        chip, background_sum: ChIP totals and summed background coverage.
        ratios, ratio: normalization factors; ``ratio`` is keyed by strand.
        args: dict with at least "min_enrichment" (fraction in [0, 1]).

    Returns:
        dict mapping peak type (int) -> annotated, filtered peaks.
    """
    print("peaks and zscores")
    all_peaks, zs = _compute_peaks_and_zscores(cvg, center, left, right, chip,
                                               background_sum, ratios, ratio,
                                               args)
    print("peaks and zscores done")

    min_er = args["min_enrichment"]
    peaks_with_info = {}
    for peak_type, peaks in enumerate(all_peaks, 1):
        # Max z-score per peak -> -log10 p-value, rounded to 3 decimals.
        max_zs = {}
        for k, v in zs[peak_type - 1].items():
            max_zs[k] = np.array([max(v2[1]) for v2 in v])
        result = {k: -(pnorm(v) / np.log(10)) for k, v in max_zs.items()}
        peaks.NLP = np.around(
            np.concatenate([result[k] for k in natsorted(result)]), 3)
        # Summit = midpoint of the peak interval. np.long was removed in
        # NumPy 1.24; np.int64 keeps the same 64-bit width.
        peaks.Location = np.array(np.ceil((peaks.Start + peaks.End) / 2),
                                  dtype=np.int64)
        peaks.Type = peak_type

        # One-bp ranges at the summits (keyword made consistent with the
        # other PyRanges constructors in this file: chromosomes=, not
        # the legacy seqnames=).
        peaks_loc = PyRanges(chromosomes=peaks.Chromosome,
                             starts=peaks.Location,
                             ends=peaks.Location + 1,
                             strands=peaks.Strand)
        loc_cvg = peaks_loc.coverage()
        chip_cvg = loc_cvg * cvg
        bg_cvg = loc_cvg * background_sum

        # Strand-specific enrichment of ChIP signal over background, with a
        # +1 pseudocount on both numerator and denominator.
        peak_enrich_cvg_f = 1 + (ratio["+"] * chip_cvg["+"])
        peak_enrich_cvg_r = 1 + (ratio["-"] * chip_cvg["-"])
        # list() each side before concatenating: dict-view .items() objects
        # cannot be added with `+` on Python 3.
        peak_enrich_cvg = PyRles({
            k: v
            for k, v in (list(peak_enrich_cvg_r.items()) +
                         list(peak_enrich_cvg_f.items()))
        })
        peak_enrich_ref = 1 + (bg_cvg)
        peak_enrich = peak_enrich_cvg / peak_enrich_ref

        vals_f = np.concatenate(
            [peak_enrich[k].values for k in peak_enrich["+"].keys()])
        vals_r = np.concatenate(
            [peak_enrich[k].values for k in peak_enrich["-"].keys()])
        vals_f = vals_f[np.isfinite(vals_f)]
        vals_r = vals_r[np.isfinite(vals_r)]
        vals_f = vals_f[vals_f > 1]
        vals_r = vals_r[vals_r > 1]

        # Enrichment thresholds are computed once, from peak type 1, and
        # reused for the later peak types.
        if peak_type == 1:
            min_er_f = np.percentile(vals_f, min_er * 100)
            min_er_r = np.percentile(vals_r, min_er * 100)
        vals_f = vals_f > min_er_f
        vals_r = vals_r > min_er_r

        peaks["+"].Enrichment = vals_f
        peaks["-"].Enrichment = vals_r
        peaks_loc["+"].Enrichment = vals_f
        peaks_loc["-"].Enrichment = vals_r
        # Keep only sufficiently enriched peaks.
        peaks = peaks.apply(
            lambda df, _: df[df.Enrichment].drop("Enrichment", axis=1))
        peaks_loc = peaks_loc.apply(
            lambda df, _: df[df.Enrichment].drop("Enrichment", axis=1))
        peaks_loc.Start += 1
        peaks_loc.End += 1

        # Coverage at the summit and in the flanking windows for each
        # surviving peak.
        chip_cvg = np.array(np.concatenate([
            cvg[k][peaks[k].Location] for k in cvg.keys()
            if not peaks[k].empty()
        ]), dtype=np.int64)
        left_cvg = np.array(np.concatenate([
            left[k][peaks[k].Location] for k in left.keys()
            if not peaks[k].empty()
        ]), dtype=np.int64)
        right_cvg = np.array(np.concatenate([
            right[k][peaks[k].Location] for k in right.keys()
            if not peaks[k].empty()
        ]), dtype=np.int64)
        peaks.CVG = chip_cvg
        peaks.SURL = left_cvg
        peaks.SURR = right_cvg
        peaks.drop_empty()
        peaks_with_info[peak_type] = peaks
    return peaks_with_info