def _strongest_overlapping_peak_center(peaks, chrom, start, end):
    """Return the ``Center`` of the highest-``Score`` peak overlapping an anchor.

    ``peaks`` is a DataFrame with ``Chromosome``, ``Start``, ``End``, ``Score``
    and ``Center`` columns. A peak overlaps the anchor when it is on ``chrom``
    and ``peak.Start <= end`` and ``start <= peak.End``.

    Raises ``ValueError`` (from ``idxmax``) when no peak overlaps.
    """
    overlapping = peaks[(peaks.Chromosome == chrom) & (peaks.Start <= end) & (start <= peaks.End)]
    # idxmax() returns an index *label*, so the row must be fetched with .loc.
    return overlapping.loc[overlapping["Score"].idxmax(), "Center"]


def cluster_PETs(args):
    """Cluster overlapping paired-end tags (PETs) and save the clusters.

    The function reads PETs, filters and extends them, optionally keeps only
    PETs whose anchors intersect given peaks, then repeatedly merges PETs whose
    first anchors overlap and whose second anchors overlap until a full pass
    makes no merge. Chromosomes are processed independently.

    Args:
        args: object exposing the following attributes:
            pets_filename: iterable of paths to tab-separated, header-less
                files with the columns chrom1, start1, end1, chrom2, start2,
                end2, cnt.
            nrows: passed to ``pandas.read_csv`` (per file).
            extension: number of bp subtracted from every anchor start
                (clipped at 0) and added to every anchor end.
            self_ligation: minimal ``start2 - end1`` for a PET to be kept.
            pet_cutoff: minimal ``cnt`` for a PET to be kept.
            peaks_filename: optional tab-separated, header-less file; columns
                0, 1, 2 and 6 are read as Chromosome, Start, End and Score.
            clusters_filename: output path (tab-separated, no header).
            cluster_cutoff: minimal ``cnt`` for a cluster to be saved.

    Returns:
        pandas.DataFrame with the saved clusters (columns chrom1, start1,
        end1, chrom2, start2, end2, cnt, plus Center1 and Center2 when peaks
        were given).
    """
    # reading the file
    logging.info(f"Reading PETs from {args.pets_filename} ...")
    columns = ["chrom1", "start1", "end1", "chrom2", "start2", "end2", "cnt"]
    pets = pd.concat([
        pd.read_csv(f, sep="\t", header=None, names=columns, low_memory=False, nrows=args.nrows)
        for f in args.pets_filename
    ])
    logging.info(f"Read {len(pets):,} PETs.")

    # pre-process
    logging.info(f"Preprocessing (Exstension: {args.extension}bp, Self-ligation genomic span: {args.self_ligation}bp, "
                 f"PET cutoff: {args.pet_cutoff}) ...")
    # check the data integrity; everything that is not valid is reported and dropped
    valid = (pets.chrom1 == pets.chrom2) & (pets.start1 <= pets.end1) & (pets.start2 <= pets.end2)
    invalid_pets = pets[~valid]
    if len(invalid_pets) > 0:
        logging.info(f"{len(invalid_pets)} inter-chromosomal or misordered PETs are ignored:")
        logging.info(invalid_pets.head())
    pets = pets[valid]
    # keep only non-self-ligating PETs
    pets = pets[pets.start2 - pets.end1 >= args.self_ligation]
    # keep PETs with count >= PET cutoff; copy so the column updates below
    # operate on an independent frame instead of a slice
    pets = pets[pets.cnt >= args.pet_cutoff].copy()
    # add extension
    pets["start1"] = (pets.start1 - args.extension).clip(lower=0)
    pets["end1"] = pets.end1 + args.extension
    pets["start2"] = (pets.start2 - args.extension).clip(lower=0)
    pets["end2"] = pets.end2 + args.extension
    # remove not intersecting anchors
    peaks = None
    if args.peaks_filename:
        peaks = pd.read_csv(args.peaks_filename, sep="\t", header=None, usecols=[0, 1, 2, 6],
                            names=["Chromosome", "Start", "End", "Score"])
        peak_ranges = PyRanges(peaks)
        # NOTE(review): PyRanges.intersect appears to report one row per
        # overlapping peak, so a PET anchor overlapping several peaks may be
        # duplicated here - confirm this is intended.
        for chrom_col, start_col, end_col in (("chrom1", "start1", "end1"),
                                              ("chrom2", "start2", "end2")):
            pets["Chromosome"], pets["Start"], pets["End"] = pets[chrom_col], pets[start_col], pets[end_col]
            pets = PyRanges(pets).intersect(peak_ranges).df
        # A clean 0..n-1 index keeps the later label-based lookup unambiguous.
        peaks = peak_ranges.df.reset_index(drop=True)
        peaks["Center"] = peaks["Start"] + (peaks["End"] - peaks["Start"]) // 2
    logging.info(f"Done. {len(pets):,} PETs left.")

    # chromosome -> number of PETs; empty when nothing survived the filters
    chroms = pets.groupby(["chrom1"]).size().to_dict() if len(pets) > 0 else {}
    # Filter the rows of every chromosome once and reuse them for all columns.
    per_chrom = [pets[pets.chrom1 == chrom] for chrom in chroms]

    def column_arrays(column):
        # Explicit copies: the kernel below mutates these arrays in place and
        # must own writable buffers that are independent of the DataFrame.
        return numba.typed.List([group[column].to_numpy(copy=True) for group in per_chrom])

    start1s = column_arrays("start1")
    end1s = column_arrays("end1")
    start2s = column_arrays("start2")
    end2s = column_arrays("end2")
    cnts = column_arrays("cnt")
    del per_chrom

    # Defined once, outside the iteration loop, so numba compiles it only once
    # instead of on every clustering pass.
    @numba.jit(nopython=True, parallel=True)
    def cluster(sizes, old_changes, orders, start1s, end1s, start2s, end2s, cnts):
        # One counter per chromosome; every chromosome (including the last
        # one) is visited.
        changes = np.zeros(shape=(len(sizes),), dtype=np.uint64)
        for idx in numba.prange(len(sizes)):
            # A chromosome without merges in the previous pass cannot change.
            if old_changes[idx] > 0:
                order, start1, end1, start2, end2, cnt =\
                    orders[idx], start1s[idx], end1s[idx], start2s[idx], end2s[idx], cnts[idx]
                size = sizes[idx]
                for _i in range(size):
                    i = order[_i]
                    if cnt[i] == 0:
                        # already merged into another PET
                        continue
                    _j = _i + 1
                    while _j < size:
                        j = order[_j]
                        # PETs are sorted by start1: nothing further can overlap anchor 1
                        if start1[j] > end1[i]:
                            break
                        if cnt[j] == 0:
                            _j += 1
                            continue
                        # NOTE(review): this test requires an endpoint of j to lie
                        # inside i, so a j-anchor that strictly contains the i-anchor
                        # is not treated as overlapping - confirm whether intended.
                        if ((start1[i] <= start1[j] and start1[j] <= end1[i]) or
                                (start1[i] <= end1[j] and end1[j] <= end1[i])) and\
                                ((start2[i] <= start2[j] and start2[j] <= end2[i]) or
                                 (start2[i] <= end2[j] and end2[j] <= end2[i])):
                            # merge j into i and mark j as consumed
                            start1[i] = min(start1[i], start1[j])
                            end1[i] = max(end1[i], end1[j])
                            start2[i] = min(start2[i], start2[j])
                            end2[i] = max(end2[i], end2[j])
                            cnt[i] += cnt[j]
                            cnt[j] = 0
                            changes[idx] += 1
                        _j += 1
        return changes

    sizes = numba.typed.List(chroms.values())
    step = 0
    # Same dtype as the kernel's result, so only one specialization is compiled.
    changes = np.ones(shape=(len(chroms),), dtype=np.uint64)
    while np.sum(changes) > 0:
        # sorting
        logging.info(f"Sorting (step: #{step+1}, PETs: {len(pets):,}) ...")
        # lexsort uses the last key as the primary one: start1, end1, start2, end2
        orders = numba.typed.List([np.lexsort((end2s[i], start2s[i], end1s[i], start1s[i]))
                                   for i in range(len(chroms))])
        logging.info("Done.")

        # clustering
        logging.info(f"Clustering (step: #{step+1}, PETs: {len(pets):,}) ...")
        changes = cluster(sizes, changes, orders, start1s, end1s, start2s, end2s, cnts)
        logging.info(f"Done. Changes: {int(np.sum(changes)):,}")
        step += 1

    # save to file
    logging.info(f"Saving to {args.clusters_filename} (cluster cufoff: {args.cluster_cutoff})... ")
    # The final pass made no change, so `orders` still describes the sorted data.
    frames = [
        pd.DataFrame(data={"chrom1": chrom,
                           "start1": start1s[i][orders[i]],
                           "end1": end1s[i][orders[i]],
                           "chrom2": chrom,
                           "start2": start2s[i][orders[i]],
                           "end2": end2s[i][orders[i]],
                           "cnt": cnts[i][orders[i]]})
        for i, chrom in enumerate(chroms)
    ]
    # pd.concat rejects an empty list, so fall back to an empty, well-formed frame.
    pets = pd.concat(frames) if frames else pd.DataFrame(columns=columns)
    # PETs merged into another one carry cnt == 0 and are dropped by the cutoff.
    pets = pets[pets.cnt >= args.cluster_cutoff].copy()
    if peaks is not None:
        # Plain lists are assigned positionally, which is safe even though the
        # concatenated frame has a repeating index.
        pets["Center1"] = [
            _strongest_overlapping_peak_center(peaks, chrom, start, end)
            for chrom, start, end in zip(pets.chrom1, pets.start1, pets.end1)
        ]
        pets["Center2"] = [
            _strongest_overlapping_peak_center(peaks, chrom, start, end)
            for chrom, start, end in zip(pets.chrom2, pets.start2, pets.end2)
        ]
    pets.to_csv(args.clusters_filename, sep="\t", index=False, header=False)
    logging.info(f"Done. Saved {len(pets):,} clusters.")
    return pets
def compute_peaks_and_zscores(cvg, center, left, right, chip, background_sum, ratios, ratio, args):
    """Annotate and filter the peaks found by ``_compute_peaks_and_zscores``.

    For every peak set returned by the helper (numbered from 1 and stored as
    ``Type``) the function attaches the columns ``NLP``, ``Location``,
    ``CVG``, ``SURL`` and ``SURR`` and keeps only peaks flagged as enriched.

    Args:
        cvg: coverage object; multiplied with the peak-location coverage and
            indexed per key by peak ``Location`` to produce ``CVG``.
        center, chip, ratios: only forwarded to ``_compute_peaks_and_zscores``.
        left, right: per-key coverage objects indexed by peak ``Location`` to
            produce ``SURL`` and ``SURR``.
        background_sum: coverage multiplied with the peak-location coverage to
            form the enrichment reference.
        ratio: mapping indexed with ``"+"`` and ``"-"``; scales the chip
            coverage of the corresponding strand.
        args: mapping providing ``"min_enrichment"``; it is multiplied by 100
            and used as a percentile, so presumably a fraction in [0, 1].

    Returns:
        dict mapping the peak type (1-based position in the helper's result)
        to the annotated, filtered peaks object.
    """
    print("peaks and zscores")
    all_peaks, zs = _compute_peaks_and_zscores(cvg, center, left, right, chip,
                                               background_sum, ratios, ratio, args)
    print("peaks and zscores done")

    min_er = args["min_enrichment"]

    peaks_with_info = {}
    for peak_type, peaks in enumerate(all_peaks, 1):

        # Largest value of element [1] of every entry, per key.
        max_zs = {}
        for k, v in zs[peak_type - 1].items():
            max_zs[k] = np.array([max(v2[1]) for v2 in v])

        # Dividing by ln(10) and negating; presumably pnorm returns a
        # natural-log p-value, making this -log10(p) - TODO confirm.
        result = {k: -(pnorm(v) / np.log(10)) for k, v in max_zs.items()}

        # Values are concatenated in natural-sorted key order; assumes the
        # rows of `peaks` follow the same order - TODO confirm.
        peaks.NLP = np.around(
            np.concatenate([result[k] for k in natsorted(result)]), 3)

        # np.int64 instead of np.long: np.long does not exist in
        # NumPy 1.24-1.26 and has a platform-dependent width elsewhere.
        peaks.Location = np.array(np.ceil((peaks.Start + peaks.End) / 2),
                                  dtype=np.int64)

        peaks.Type = peak_type

        # One-base-wide ranges at every peak location.
        peaks_loc = PyRanges(seqnames=peaks.Chromosome,
                             starts=peaks.Location,
                             ends=peaks.Location + 1,
                             strands=peaks.Strand)
        loc_cvg = peaks_loc.coverage()

        chip_cvg = loc_cvg * cvg
        bg_cvg = loc_cvg * background_sum

        peak_enrich_cvg_f = 1 + (ratio["+"] * chip_cvg["+"])
        peak_enrich_cvg_r = 1 + (ratio["-"] * chip_cvg["-"])
        # NOTE(review): the two .items() results are joined with `+`, which
        # requires them to be lists; plain dict views would raise TypeError.
        peak_enrich_cvg = PyRles({
            k: v
            for k, v in list(peak_enrich_cvg_r.items() +
                             peak_enrich_cvg_f.items())
        })

        peak_enrich_ref = 1 + (bg_cvg)
        peak_enrich = peak_enrich_cvg / peak_enrich_ref

        vals_f = np.concatenate(
            [peak_enrich[k].values for k in peak_enrich["+"].keys()])
        vals_r = np.concatenate(
            [peak_enrich[k].values for k in peak_enrich["-"].keys()])

        # Drop non-finite ratios, then keep only ratios above 1.
        vals_f = vals_f[np.isfinite(vals_f)]
        vals_r = vals_r[np.isfinite(vals_r)]
        vals_f = vals_f[vals_f > 1]
        vals_r = vals_r[vals_r > 1]

        # The thresholds are computed for the first peak type only and are
        # reused unchanged for all later types.
        if peak_type == 1:
            min_er_f = np.percentile(vals_f, min_er * 100)
            min_er_r = np.percentile(vals_r, min_er * 100)

        # From here on vals_f / vals_r are boolean masks.
        vals_f = vals_f > min_er_f
        vals_r = vals_r > min_er_r

        # NOTE(review): these assign to the result of indexing by strand;
        # whether the column reaches `peaks` / `peaks_loc` depends on PyRanges
        # returning a view - TODO confirm. The masks must also have one entry
        # per peak of that strand.
        peaks["+"].Enrichment = vals_f
        peaks["-"].Enrichment = vals_r
        peaks_loc["+"].Enrichment = vals_f
        peaks_loc["-"].Enrichment = vals_r

        # Keep only the rows flagged as enriched and remove the helper column.
        peaks = peaks.apply(
            lambda df, _: df[df.Enrichment].drop("Enrichment", axis=1))
        peaks_loc = peaks_loc.apply(
            lambda df, _: df[df.Enrichment].drop("Enrichment", axis=1))
        peaks_loc.Start += 1
        peaks_loc.End += 1

        # Coverage values at the surviving peak locations, skipping keys
        # without peaks.
        chip_cvg = np.array(np.concatenate([
            cvg[k][peaks[k].Location] for k in cvg.keys()
            if not peaks[k].empty()
        ]), dtype=np.int64)
        left_cvg = np.array(np.concatenate([
            left[k][peaks[k].Location] for k in left.keys()
            if not peaks[k].empty()
        ]), dtype=np.int64)
        right_cvg = np.array(np.concatenate([
            right[k][peaks[k].Location] for k in right.keys()
            if not peaks[k].empty()
        ]), dtype=np.int64)

        peaks.CVG = chip_cvg
        peaks.SURL = left_cvg
        peaks.SURR = right_cvg

        # NOTE(review): the return value is ignored, so this only has an
        # effect if drop_empty() works in place - TODO confirm.
        peaks.drop_empty()

        peaks_with_info[peak_type] = peaks

    return peaks_with_info