Python PyRanges.apply Examples

Programming Language: Python

Namespace/Package Name: pyranges

Class/Type: PyRanges

Method/Function: apply

Examples at hotexamples.com: 2

Python PyRanges.apply - 2 examples found. These are the top-rated real-world Python examples of pyranges.PyRanges.apply, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

PyRanges(30)

join(5)

coverage(3)

Length(2)

nearest(2)

lengths(2)

apply(2)

intersection(2)

df(2)

groupby(1)

set_intersection(1)

overlap(1)

is_const(1)

intersect(1)

CVG(1)

drop_empty(1)

Form(1)

count_overlaps(1)

as_df(1)

SURR(1)

SURL(1)

NLP(1)

MAX_NLP(1)

Location(1)

to_csv(1)

Example #1

Show file

File: cluster_PETs.py Project: cellular-genomics/cluster-paired-end-tags

def cluster_PETs(args):
    """Cluster paired-end tags (PETs) whose two anchors both overlap and save them.

    PETs are read from tab-separated files, filtered, extended, optionally
    restricted to anchors intersecting the given peaks, and then merged
    per chromosome in repeated passes until a pass performs no merge.

    Args:
        args: Namespace-like object. The attributes read here are
            ``pets_filename`` (iterable of input paths), ``nrows``,
            ``extension``, ``self_ligation``, ``pet_cutoff``,
            ``peaks_filename`` (falsy to skip the peak filtering),
            ``clusters_filename`` and ``cluster_cutoff``.

    Returns:
        pandas.DataFrame: the clusters that were written to
        ``args.clusters_filename`` (``chrom1, start1, end1, chrom2, start2,
        end2, cnt`` plus ``Center1``/``Center2`` when peaks were supplied).
    """
    # reading the file
    logging.info(f"Reading PETs from {args.pets_filename} ...")
    columns = ["chrom1", "start1", "end1", "chrom2", "start2", "end2", "cnt"]
    pets = pd.concat([pd.read_csv(f, sep="\t", header=None, names=columns, low_memory=False, nrows=args.nrows)
                      for f in args.pets_filename])
    logging.info(f"Read {len(pets):,} PETs.")

    # pre-process
    logging.info(f"Preprocessing (Exstension: {args.extension}bp, Self-ligation genomic span: {args.self_ligation}bp, "
                 f"PET cutoff: {args.pet_cutoff}) ...")

    # check the data integrity
    invalid_pets = pets[(pets.chrom1 != pets.chrom2) | (pets.start1 > pets.end1) | (pets.start2 > pets.end2)]
    if len(invalid_pets) > 0:
        logging.info(f"{len(invalid_pets)} inter-chromosomal or misordered PETs are ignored:")
        logging.info(invalid_pets.head())
    pets = pets[(pets.chrom1 == pets.chrom2) & (pets.start1 <= pets.end1) & (pets.start2 <= pets.end2)]

    # keep only non-self-ligating PETs
    pets = pets[pets.start2 - pets.end1 >= args.self_ligation]

    # keep PETs with count >= PET cutoff; copy so the column assignments
    # below write to an owned frame rather than to a slice of the input
    pets = pets[pets.cnt >= args.pet_cutoff].copy()

    # add extension
    pets.start1 = (pets.start1 - args.extension).clip(0)
    pets.end1 = pets.end1 + args.extension
    pets.start2 = (pets.start2 - args.extension).clip(0)
    pets.end2 = pets.end2 + args.extension

    # remove not intersecting anchors
    peaks = None
    if args.peaks_filename:
        peaks = pd.read_csv(args.peaks_filename, sep="\t", header=None, usecols=[0, 1, 2, 6],
                            names=["Chromosome", "Start", "End", "Score"])
        peaks = PyRanges(peaks)
        pets["Chromosome"], pets["Start"], pets["End"] = pets.chrom1, pets.start1, pets.end1
        pets = PyRanges(pets).intersect(peaks).df
        pets["Chromosome"], pets["Start"], pets["End"] = pets.chrom2, pets.start2, pets.end2
        pets = PyRanges(pets).intersect(peaks).df
        # A unique 0..n-1 index makes the idxmax() label lookup below unambiguous.
        peaks = peaks.df.reset_index(drop=True)
        peaks["Center"] = peaks["Start"] + (peaks["End"]-peaks["Start"]) // 2

    logging.info(f"Done. {len(pets):,} PETs left.")

    chroms = pets.groupby("chrom1").size().to_dict()

    # One set of arrays per chromosome. The kernel mutates them in place, so
    # take explicit writable copies instead of views into the DataFrame.
    per_chrom = [pets[pets.chrom1 == chrom] for chrom in chroms]
    start1s = numba.typed.List([sub.start1.to_numpy(copy=True) for sub in per_chrom])
    end1s = numba.typed.List([sub.end1.to_numpy(copy=True) for sub in per_chrom])
    start2s = numba.typed.List([sub.start2.to_numpy(copy=True) for sub in per_chrom])
    end2s = numba.typed.List([sub.end2.to_numpy(copy=True) for sub in per_chrom])
    cnts = numba.typed.List([sub.cnt.to_numpy(copy=True) for sub in per_chrom])
    chrom_sizes = numba.typed.List(chroms.values())

    # Defined once, outside the loop, so numba compiles it a single time
    # instead of building (and compiling) a new dispatcher on every pass.
    @numba.jit(nopython=True, parallel=True)
    def cluster(chroms, old_changes, orders, start1s, end1s, start2s, end2s, cnts):
        # One counter per chromosome; every chromosome (including the last
        # one) is visited.
        changes = np.zeros(shape=(len(chroms),), dtype=np.uint64)

        for idx in numba.prange(len(chroms)):
            # A chromosome that had no merge in the previous pass is skipped.
            if old_changes[idx] > 0:
                order, start1, end1, start2, end2, cnt =\
                    orders[idx], start1s[idx], end1s[idx], start2s[idx], end2s[idx], cnts[idx]
                for _i in range(chroms[idx]):
                    i = order[_i]
                    if cnt[i] == 0:  # already merged into another PET
                        continue
                    _j = _i + 1
                    while _j < chroms[idx]:
                        j = order[_j]
                        # PETs are visited in start1 order, so nothing further
                        # along can overlap the first anchor of i.
                        if start1[j] > end1[i]:
                            break
                        if cnt[j] == 0:
                            _j += 1
                            continue
                        # Closed-interval overlap on both anchors. This form
                        # also catches an anchor of j that strictly contains
                        # the matching anchor of i.
                        if (start1[i] <= end1[j] and start1[j] <= end1[i]) and\
                           (start2[i] <= end2[j] and start2[j] <= end2[i]):
                            start1[i] = min(start1[i], start1[j])
                            end1[i] = max(end1[i], end1[j])
                            start2[i] = min(start2[i], start2[j])
                            end2[i] = max(end2[i], end2[j])
                            cnt[i] += cnt[j]
                            cnt[j] = 0  # mark j as consumed
                            changes[idx] += 1
                        _j += 1
        return changes

    step = 0
    # Same dtype as the kernel's return value, so only one signature is compiled.
    changes = np.ones(shape=(len(chroms),), dtype=np.uint64)
    while np.sum(changes) > 0:
        # sorting
        logging.info(f"Sorting (step: #{step+1}, PETs: {len(pets):,}) ...")
        orders = numba.typed.List([np.lexsort((end2s[i], start2s[i], end1s[i], start1s[i]))
                                   for i in range(len(chroms))])
        logging.info("Done.")

        # clustering
        logging.info(f"Clustering (step: #{step+1}, PETs: {len(pets):,}) ...")
        changes = cluster(chrom_sizes, changes, orders, start1s, end1s, start2s, end2s, cnts)
        logging.info(f"Done. Changes: {int(sum(changes)):,}")
        step += 1

    # save to file
    logging.info(f"Saving to {args.clusters_filename} (cluster cufoff: {args.cluster_cutoff})... ")
    frames = [pd.DataFrame(data={"chrom1": itertools.repeat(chrom, size),
                                 "start1": start1s[i][orders[i]],
                                 "end1": end1s[i][orders[i]],
                                 "chrom2": itertools.repeat(chrom, size),
                                 "start2": start2s[i][orders[i]],
                                 "end2": end2s[i][orders[i]],
                                 "cnt": cnts[i][orders[i]]})
              for i, (chrom, size) in enumerate(chroms.items())]
    # With no PETs left there is nothing to concatenate; keep the schema so
    # the cutoff filter and the (empty) output file still work.
    pets = pd.concat(frames) if frames else pd.DataFrame(columns=columns)
    # Merged-away PETs have cnt == 0 and are dropped here for any positive cutoff.
    pets = pets[pets.cnt >= args.cluster_cutoff]
    if peaks is not None and not pets.empty:
        def best_peak_center(chrom, start, end):
            # Center of the highest-scoring peak overlapping [start, end].
            hits = peaks[(peaks.Chromosome == chrom) & (peaks.Start <= end) & (start <= peaks.End)]
            # idxmax() returns an index label, hence .loc.
            return peaks.loc[hits["Score"].idxmax(), "Center"]

        pets["Center1"] = pets.apply(lambda row: best_peak_center(row.chrom1, row.start1, row.end1), axis=1)
        # chrom1 == chrom2 for every PET kept by the integrity filter above.
        pets["Center2"] = pets.apply(lambda row: best_peak_center(row.chrom1, row.start2, row.end2), axis=1)
    pets.to_csv(args.clusters_filename, sep="\t", index=False, header=False)
    logging.info(f"Done. Saved {len(pets):,} clusters.")

    return pets

Example #2

Show file

def compute_peaks_and_zscores(cvg, center, left, right, chip, background_sum,
                              ratios, ratio, args):
    """Annotate candidate peaks and keep those above the enrichment cutoff.

    Candidate peaks and their z-scores come from
    ``_compute_peaks_and_zscores``. For each peak set this function adds
    ``NLP``, ``Location`` and ``Type`` columns, filters on an enrichment
    threshold, and attaches the ``CVG``, ``SURL`` and ``SURR`` columns read
    from ``cvg``, ``left`` and ``right`` at the peak locations.

    Args:
        cvg, center, left, right, chip, background_sum, ratios: forwarded
            unchanged to ``_compute_peaks_and_zscores``; ``cvg``, ``left``,
            ``right`` and ``background_sum`` are also used here as
            per-key coverage containers.
        ratio: mapping with ``"+"`` and ``"-"`` entries used to scale the
            strand-specific ChIP coverage.
        args: mapping; only ``args["min_enrichment"]`` is read here and is
            used as a fraction (multiplied by 100 to get a percentile).

    Returns:
        dict: peak type (1-based position in the list returned by
        ``_compute_peaks_and_zscores``) mapped to the annotated peaks.
    """

    print("peaks and zscores")
    all_peaks, zs = _compute_peaks_and_zscores(cvg, center, left, right, chip,
                                               background_sum, ratios, ratio,
                                               args)
    print("peaks and zscores done")

    min_er = args["min_enrichment"]

    peaks_with_info = {}
    for peak_type, peaks in enumerate(all_peaks, 1):

        # Largest value of element [1] of every entry, per key.
        max_zs = {}
        for k, v in zs[peak_type - 1].items():
            max_zs[k] = np.array([max(v2[1]) for v2 in v])

        result = {k: -(pnorm(v) / np.log(10)) for k, v in max_zs.items()}
        peaks.NLP = np.around(
            np.concatenate([result[k] for k in natsorted(result)]), 3)

        # np.long was removed from NumPy 1.24; np.int64 is an explicit
        # fixed-width integer type available in every NumPy version.
        peaks.Location = np.array(np.ceil((peaks.Start + peaks.End) / 2),
                                  dtype=np.int64)

        peaks.Type = peak_type

        peaks_loc = PyRanges(seqnames=peaks.Chromosome,
                             starts=peaks.Location,
                             ends=peaks.Location + 1,
                             strands=peaks.Strand)
        loc_cvg = peaks_loc.coverage()

        chip_cvg = loc_cvg * cvg
        bg_cvg = loc_cvg * background_sum

        peak_enrich_cvg_f = 1 + (ratio["+"] * chip_cvg["+"])
        peak_enrich_cvg_r = 1 + (ratio["-"] * chip_cvg["-"])
        # list(...) on both sides keeps the concatenation valid whether
        # items() returns a list or a view; reverse-strand entries go first,
        # as before.
        peak_enrich_cvg = PyRles(
            dict(list(peak_enrich_cvg_r.items()) +
                 list(peak_enrich_cvg_f.items())))

        peak_enrich_ref = 1 + (bg_cvg)
        peak_enrich = peak_enrich_cvg / peak_enrich_ref

        vals_f = np.concatenate(
            [peak_enrich[k].values for k in peak_enrich["+"].keys()])
        vals_r = np.concatenate(
            [peak_enrich[k].values for k in peak_enrich["-"].keys()])
        vals_f = vals_f[np.isfinite(vals_f)]
        vals_r = vals_r[np.isfinite(vals_r)]

        vals_f = vals_f[vals_f > 1]
        vals_r = vals_r[vals_r > 1]

        # The thresholds are computed from the first peak set only and are
        # reused for every later one.
        if peak_type == 1:
            min_er_f = np.percentile(vals_f, min_er * 100)
            min_er_r = np.percentile(vals_r, min_er * 100)

        # Turn the enrichment values into boolean keep-masks.
        vals_f = vals_f > min_er_f
        vals_r = vals_r > min_er_r

        # NOTE(review): presumably the masks line up one-to-one with the
        # rows of each strand after the filtering above - confirm.
        peaks["+"].Enrichment = vals_f
        peaks["-"].Enrichment = vals_r

        peaks_loc["+"].Enrichment = vals_f
        peaks_loc["-"].Enrichment = vals_r

        # Keep only the rows flagged as enriched, then drop the helper column.
        peaks = peaks.apply(
            lambda df, _: df[df.Enrichment].drop("Enrichment", axis=1))
        peaks_loc = peaks_loc.apply(
            lambda df, _: df[df.Enrichment].drop("Enrichment", axis=1))
        peaks_loc.Start += 1
        peaks_loc.End += 1

        chip_cvg = np.array(np.concatenate([
            cvg[k][peaks[k].Location] for k in cvg.keys()
            if not peaks[k].empty()
        ]),
                            dtype=np.int64)
        left_cvg = np.array(np.concatenate([
            left[k][peaks[k].Location] for k in left.keys()
            if not peaks[k].empty()
        ]),
                            dtype=np.int64)
        right_cvg = np.array(np.concatenate([
            right[k][peaks[k].Location] for k in right.keys()
            if not peaks[k].empty()
        ]),
                             dtype=np.int64)

        peaks.CVG = chip_cvg
        peaks.SURL = left_cvg
        peaks.SURR = right_cvg

        # NOTE(review): the return value is discarded, so this only has an
        # effect if drop_empty() works in place - confirm.
        peaks.drop_empty()

        peaks_with_info[peak_type] = peaks

    return peaks_with_info