def count_overlaps(self, other, **kwargs):
    """Run the ``_number_overlapping`` kernel on self against other.

    Parameters
    ----------
    other : PyRanges
        Second operand handed to ``pyrange_apply``.
    **kwargs
        Options; passed through ``fill_kwargs`` and then forwarded to
        ``pyrange_apply``.

    Returns
    -------
    PyRanges
        The output of ``pyrange_apply`` wrapped in a new PyRanges.
    """
    from pyranges.methods.coverage import _number_overlapping

    options = fill_kwargs(kwargs)
    return pr.PyRanges(pyrange_apply(_number_overlapping, self, other, **options))
def overlap(self, other, **kwargs):
    """Run the ``_overlap`` kernel on self against other.

    Parameters
    ----------
    other : PyRanges
        Second operand handed to ``pyrange_apply``.
    **kwargs
        Options; passed through ``fill_kwargs``. Any caller-supplied
        ``sparse`` entry is overwritten before that.

    Returns
    -------
    PyRanges
        The output of ``pyrange_apply`` wrapped in a new PyRanges.
    """
    # NOTE(review): the meaning of ``sparse`` is defined by pyrange_apply,
    # which is not visible here.
    kwargs.update(sparse={"self": False, "other": True})
    options = fill_kwargs(kwargs)
    return PyRanges(pyrange_apply(_overlap, self, other, **options))
def intersect(self, other, **kwargs):
    """Run the ``_intersection`` kernel on self against other.

    Parameters
    ----------
    other : PyRanges
        Second operand handed to ``pyrange_apply``.
    **kwargs
        Options; passed through ``fill_kwargs``. The ``sparse`` entry is set
        afterwards, so it always wins over a caller-supplied value.

    Returns
    -------
    PyRanges
        The output of ``pyrange_apply`` wrapped in a new PyRanges.
    """
    options = fill_kwargs(kwargs)
    options["sparse"] = {"self": False, "other": True}
    return PyRanges(pyrange_apply(_intersection, self, other, **options))
def join(self, other, **kwargs):
    """Run the ``_write_both`` join kernel on self against other.

    Parameters
    ----------
    other : PyRanges
        Second operand handed to ``pyrange_apply``.
    **kwargs
        Options; passed through ``fill_kwargs`` and forwarded.

    Returns
    -------
    PyRanges
        The output of ``pyrange_apply`` wrapped in a new PyRanges.
    """
    from pyranges.methods.join import _write_both

    options = fill_kwargs(kwargs)
    return PyRanges(pyrange_apply(_write_both, self, other, **options))
def insert(self, other, col, **kwargs):
    """Run the ``_insert`` kernel on self against other.

    Parameters
    ----------
    other : PyRanges
        Second operand handed to ``pyrange_apply``.
    col
        Stored under ``kwargs["columns"]`` for the kernel.
    **kwargs
        Further options; passed through ``fill_kwargs`` and forwarded.

    Returns
    -------
    PyRanges
        The output of ``pyrange_apply`` wrapped in a new PyRanges.
    """
    from pyranges.methods.insert import _insert

    kwargs.update(columns=col)
    options = fill_kwargs(kwargs)
    return PyRanges(pyrange_apply(_insert, self, other, **options))
def nearest(self, other, **kwargs):
    """Run the ``_nearest`` kernel on self against other.

    Parameters
    ----------
    other : PyRanges
        Second operand handed to ``pyrange_apply``.
    **kwargs
        Options; passed through ``fill_kwargs`` and forwarded.

    Returns
    -------
    PyRanges
        The output of ``pyrange_apply`` wrapped in a new PyRanges.
    """
    from pyranges.methods.nearest import _nearest

    options = fill_kwargs(kwargs)
    return PyRanges(pyrange_apply(_nearest, self, other, **options))
def apply_pair(self, other, f, kwargs, strand=False, as_pyranges=True):
    """Apply a user function to self and other through ``pyrange_apply``.

    Parameters
    ----------
    other : PyRanges
        Second operand handed to ``pyrange_apply``.
    f : callable
        Function to apply; it is wrapped with ``ray.remote`` first.
    kwargs : dict
        Keyword arguments unpacked into ``pyrange_apply``.
    strand : bool, default False
        Not read by this implementation.
    as_pyranges : bool, default True
        Whether to wrap the result in a PyRanges.

    Returns
    -------
    PyRanges or the raw ``pyrange_apply`` output
    """
    remote_f = ray.remote(f)
    outcome = pyrange_apply(remote_f, self, other, **kwargs)

    if as_pyranges:
        return PyRanges(outcome)
    return outcome
def nearest(self, other, **kwargs):
    """Run the ``_nearest`` kernel on self against other.

    Parameters
    ----------
    other : PyRanges
        Second operand handed to ``pyrange_apply``. Must be stranded when
        ``how`` is "upstream" or "downstream".
    **kwargs
        Options; passed through ``fill_kwargs`` and forwarded. ``how`` is
        inspected after filling.

    Returns
    -------
    PyRanges
        The output of ``pyrange_apply`` wrapped in a new PyRanges.

    Raises
    ------
    AssertionError
        If a directional ``how`` is requested and ``other.stranded`` is falsy.
    """
    from pyranges.methods.nearest import _nearest

    options = fill_kwargs(kwargs)

    directional = options.get("how") in ["upstream", "downstream"]
    if directional:
        assert other.stranded, "If doing upstream or downstream nearest, other pyranges must be stranded"

    return PyRanges(pyrange_apply(_nearest, self, other, **options))
def set_intersect(self, other, **kwargs):
    """Intersect the merged intervals of self with the merged intervals of other.

    Both operands are first collapsed with ``merge`` and the
    ``_intersection`` kernel is then applied to the merged results.

    Parameters
    ----------
    other : PyRanges
        Second operand; merged before intersecting.
    **kwargs
        Options; passed through ``fill_kwargs`` and forwarded to both
        ``merge`` calls and to ``pyrange_apply``. ``strandedness`` is read
        after filling; merging is strand-aware when it is truthy.

    Returns
    -------
    PyRanges
        The output of ``pyrange_apply`` wrapped in a new PyRanges.
    """
    kwargs = fill_kwargs(kwargs)
    use_strand = bool(kwargs["strandedness"])

    merged_self = self.merge(strand=use_strand, **kwargs)
    merged_other = other.merge(strand=use_strand, **kwargs)

    return PyRanges(pyrange_apply(_intersection, merged_self, merged_other, **kwargs))
def overlap(self, other, **kwargs):
    """Run the ``_overlap`` kernel on self against other with ``how="first"``.

    Parameters
    ----------
    other : PyRanges
        Second operand handed to ``pyrange_apply``.
    **kwargs
        Options; passed through ``fill_kwargs``. Caller-supplied ``sparse``
        and ``how`` entries are overwritten before that.

    Returns
    -------
    PyRanges
        The output of ``pyrange_apply`` wrapped in a new PyRanges.
    """
    kwargs.update(sparse={"self": False, "other": True}, how="first")
    options = fill_kwargs(kwargs)
    return pr.PyRanges(pyrange_apply(_overlap, self, other, **options))
def join(self, other, **kwargs):
    """Join self with other using the ``_write_both`` kernel.

    Parameters
    ----------
    other : PyRanges
        Second operand handed to ``pyrange_apply``.
    **kwargs
        Options; passed through ``fill_kwargs``. Entries read here:

        slack
            If truthy, self is extended with ``self.slack(slack)`` for the
            join and the original Start/End are restored on the result.
        suffix
            If present, stored as ``suffixes = ("", suffix)``.
        new_pos
            If truthy, ``new_position`` is applied to the result. For
            "intersection" and "union" both suffixes must be non-empty.
        how
            For "left"/"outer" a one-row frame of other, and for
            "right"/"outer" a one-row frame of self, is passed to the kernel
            as ``example_header_other`` / ``example_header_self``.

    Returns
    -------
    PyRanges
        The joined intervals.

    Raises
    ------
    AssertionError
        If ``new_pos`` is "intersection" or "union" and a suffix is missing
        or empty.
    """
    from pyranges.methods.join import _write_both

    slack = kwargs.get("slack")
    if slack:
        # Attach the bookkeeping columns to a copy; assigning them on the
        # incoming object would leave Start__slack/End__slack behind on the
        # caller's PyRanges after the join.
        self = PyRanges({key: df.copy() for key, df in self.dfs.items()})
        self.Start__slack = self.Start
        self.End__slack = self.End
        self = self.slack(slack)

    if "suffix" in kwargs:
        kwargs["suffixes"] = ("", kwargs["suffix"])

    kwargs = fill_kwargs(kwargs)

    if kwargs.get("new_pos") in ["intersection", "union"]:
        suffixes = kwargs.get("suffixes")
        assert suffixes is not None, "Must give two non-empty suffixes when using new_pos with intersection or union."
        assert suffixes[0], "Must have nonempty first suffix when using new_pos with intersection or union."
        assert suffixes[1], "Must have nonempty second suffix when using new_pos with intersection or union."

    how = kwargs.get("how")
    if how in ["left", "outer"]:
        kwargs["example_header_other"] = other.head(1).df
    if how in ["right", "outer"]:
        kwargs["example_header_self"] = self.head(1).df

    gr = PyRanges(pyrange_apply(_write_both, self, other, **kwargs))

    if slack:
        # Put back the coordinates saved before extending, then drop the
        # helper columns.
        gr.Start = gr.Start__slack
        gr.End = gr.End__slack
        gr = gr.drop(like="(Start|End).*__slack")

    new_position = kwargs.get("new_pos")
    if new_position:
        gr = gr.new_position(new_pos=new_position, suffixes=kwargs["suffixes"])

    return gr
def subtract(self, other, **kwargs):
    """Run the ``_subtraction`` kernel on self against the merged other.

    Parameters
    ----------
    other : PyRanges
        Second operand; collapsed with ``merge`` before the kernel is applied.
    **kwargs
        Options; passed through ``fill_kwargs`` and forwarded to ``merge``
        and ``pyrange_apply``. A caller-supplied ``sparse`` entry is
        overwritten. ``strandedness`` is read after filling; merging is
        strand-aware when it is truthy.

    Returns
    -------
    PyRanges
        The output of ``pyrange_apply`` wrapped in a new PyRanges.
    """
    from pyranges.methods.subtraction import _subtraction

    kwargs.update(sparse={"self": False, "other": True})
    kwargs = fill_kwargs(kwargs)

    use_strand = bool(kwargs["strandedness"])
    merged_other = other.merge(strand=use_strand, **kwargs)

    return PyRanges(pyrange_apply(_subtraction, self, merged_other, **kwargs))
def apply_pair(self, other, f, kwargs=None, strand=False, as_pyranges=True):
    """Apply a user function to self and other through ``pyrange_apply``.

    Parameters
    ----------
    other : PyRanges
        Second operand handed to ``pyrange_apply``.
    f : callable
        Function to apply; it is wrapped with ``ray.remote`` first.
    kwargs : dict, optional
        Keyword arguments for the call; ``None`` is treated as an empty
        dict. Passed through ``fill_kwargs`` before use.
    strand : bool, default False
        Not read by this implementation.
    as_pyranges : bool, default True
        Whether to wrap the result in a PyRanges.

    Returns
    -------
    PyRanges or the raw ``pyrange_apply`` output
    """
    options = fill_kwargs({} if kwargs is None else kwargs)

    remote_f = ray.remote(f)
    outcome = pyrange_apply(remote_f, self, other, **options)

    if as_pyranges:
        return PyRanges(outcome)
    return outcome
def apply_pair(self, other, f, strandedness=False, as_pyranges=True, **kwargs):
    """Apply a user function to self and other through ``pyrange_apply``.

    Parameters
    ----------
    other : PyRanges
        Second operand handed to ``pyrange_apply``.
    f : callable
        Function to apply.
    strandedness : default False
        Stored under ``kwargs["strandedness"]`` before the options are
        filled.
    as_pyranges : bool, default True
        Whether to wrap the result in a PyRanges.
    **kwargs
        Further options; passed through ``fill_kwargs`` and forwarded.

    Returns
    -------
    PyRanges or the raw ``pyrange_apply`` output
    """
    kwargs["strandedness"] = strandedness
    options = fill_kwargs(kwargs)

    outcome = pyrange_apply(f, self, other, **options)

    if as_pyranges:
        return PyRanges(outcome)
    return outcome
def coverage(self, other, **kwargs):
    """Run the ``_coverage`` kernel on the overlap counts of self against other.

    ``count_overlaps`` is called first with ``keep_nonoverlapping=True``;
    other is then merged with ``count=True`` and the kernel is applied to the
    counts and the merged other.

    Parameters
    ----------
    other : PyRanges
        Second operand.
    **kwargs
        Options; passed through ``fill_kwargs`` and forwarded to
        ``count_overlaps`` and ``pyrange_apply``. ``strandedness`` is read
        after filling; merging is strand-aware when it is truthy.

    Returns
    -------
    PyRanges
        The output of ``pyrange_apply`` wrapped in a new PyRanges.
    """
    kwargs = fill_kwargs(kwargs)

    overlap_counts = self.count_overlaps(other, keep_nonoverlapping=True, **kwargs)

    use_strand = bool(kwargs["strandedness"])
    merged_other = other.merge(count=True, strand=use_strand)

    from pyranges.methods.coverage import _coverage

    return pr.PyRanges(pyrange_apply(_coverage, overlap_counts, merged_other, **kwargs))
def join(self, other, **kwargs):
    """Join self with other using the ``_write_both`` kernel.

    Parameters
    ----------
    other : PyRanges
        Second operand handed to ``pyrange_apply``.
    **kwargs
        Options; passed through ``fill_kwargs``. Entries read here:

        slack
            If truthy, self is extended with ``self.slack(slack)`` for the
            join and the original Start/End are restored on the result.
        suffix
            If present, stored as ``suffixes = ("", suffix)``.
        new_pos
            If truthy, ``new_position`` is applied to the result. For
            "intersection" and "union" both suffixes must be non-empty.

    Returns
    -------
    PyRanges
        The joined intervals.

    Raises
    ------
    AssertionError
        If ``new_pos`` is "intersection" or "union" and a suffix is missing
        or empty.
    """
    from pyranges.methods.join import _write_both

    slack = kwargs.get("slack")
    if slack:
        # Attach the bookkeeping columns to a copy; assigning them on the
        # incoming object would leave Start__slack/End__slack behind on the
        # caller's PyRanges after the join.
        self = PyRanges({key: df.copy() for key, df in self.dfs.items()})
        self.Start__slack = self.Start
        self.End__slack = self.End
        self = self.slack(slack)

    if "suffix" in kwargs:
        kwargs["suffixes"] = ("", kwargs["suffix"])

    kwargs = fill_kwargs(kwargs)

    if kwargs.get("new_pos") in ["intersection", "union"]:
        suffixes = kwargs.get("suffixes")
        assert suffixes is not None, "Must give two non-empty suffixes when using new_pos with intersection or union."
        assert suffixes[0], "Must have nonempty first suffix when using new_pos with intersection or union."
        assert suffixes[1], "Must have nonempty second suffix when using new_pos with intersection or union."

    gr = PyRanges(pyrange_apply(_write_both, self, other, **kwargs))

    if slack:
        # Put back the coordinates saved before extending, then drop the
        # helper columns.
        gr.Start = gr.Start__slack
        gr.End = gr.End__slack
        gr = gr.drop(like="(Start|End).*__slack")

    new_position = kwargs.get("new_pos")
    if new_position:
        gr = gr.new_position(new_pos=new_position, suffixes=kwargs["suffixes"])

    return gr
def relative_distance(self, other, **kwargs):
    """Tabulate the relative distances between self's PyRanges and other.

    The ``_relative_distance`` kernel is applied per key, the resulting
    arrays are concatenated, non-NaN values are floored to two decimals and
    the frequency of each value is counted.

    Parameters
    ----------
    other : PyRanges
        Second operand handed to ``pyrange_apply``.
    **kwargs
        Options; passed through ``fill_kwargs``. A caller-supplied ``sparse``
        entry is overwritten.

    Returns
    -------
    pandas.DataFrame
        Columns ``reldist``, ``count``, ``total`` and ``fraction``, sorted
        ascending on ``reldist`` with a fresh integer index.
    """
    gr = self.pr

    kwargs.update(sparse={"self": True, "other": True})
    kwargs = pr.pyranges.fill_kwargs(kwargs)

    per_key = pyrange_apply(_relative_distance, gr, other, **kwargs)  # pylint: disable=E1132

    distances = pd.Series(np.concatenate(list(per_key.values())))

    # Floor to two decimals; NaN entries are left untouched.
    valid = ~np.isnan(distances)
    distances.loc[valid] = np.floor(distances[valid] * 100) / 100

    n_total = len(distances)

    table = distances.value_counts(dropna=False).to_frame().reset_index()
    table.columns = ["reldist", "count"]
    table.insert(table.shape[1], "total", n_total)
    table.insert(table.shape[1], "fraction", table["count"] / n_total)

    return table.sort_values("reldist", ascending=True).reset_index(drop=True)
def introns(self, by="gene"):
    """Return the introns, computed per gene or per transcript.

    Exon rows are merged per id column and handed, together with the rows
    whose Feature equals ``by``, to the ``_introns2`` kernel.

    Parameters
    ----------
    by : str, {"gene", "transcript"}, default "gene"
        Feature level at which introns are computed.

    Returns
    -------
    PyRanges
        The kernel output wrapped in a PyRanges; an empty PyRanges when the
        underlying object has no rows.

    Raises
    ------
    AssertionError
        If ``by`` is neither "gene" nor "transcript".
    """
    kwargs = pr.pyranges.fill_kwargs({"by": by})

    assert by in ["gene", "transcript"]
    id_column = by_to_id[by]

    sorted_gr = self.pr.sort(id_column)
    if not len(sorted_gr):
        return pr.PyRanges()

    def is_exon(df):
        return df.Feature == "exon"

    def is_by_feature(df):
        return df.Feature == by

    merged_exons = sorted_gr.subset(is_exon).merge(by=id_column)
    parents = sorted_gr.subset(is_by_feature)

    return pr.PyRanges(pyrange_apply(_introns2, parents, merged_exons, **kwargs))
def relative_distance(self, other):
    """Compute spatial correlation between two sets.

    Metric which describes relative distance between each interval in one
    set and two closest intervals in another.

    Parameters
    ----------
    other : PyRanges
        Intervals to compare with.

    Returns
    -------
    pandas.DataFrame
        DataFrame containing the frequency of each relative distance.

    See Also
    --------
    pyranges.statistics.jaccard : compute the jaccard coefficient
    pyranges.statistics.forbes : compute the forbes coefficient

    Examples
    --------
    >>> gr, gr2 = pr.data.chipseq(), pr.data.chipseq_background()
    >>> chromsizes = pr.data.chromsizes()
    >>> gr.stats.relative_distance(gr2)
        reldist  count  total  fraction
    0      0.00    264   9956  0.026517
    1      0.01    226   9956  0.022700
    2      0.02    206   9956  0.020691
    3      0.03    235   9956  0.023604
    4      0.04    194   9956  0.019486
    5      0.05    241   9956  0.024207
    6      0.06    201   9956  0.020189
    7      0.07    191   9956  0.019184
    8      0.08    192   9956  0.019285
    9      0.09    191   9956  0.019184
    10     0.10    186   9956  0.018682
    11     0.11    203   9956  0.020390
    12     0.12    218   9956  0.021896
    13     0.13    209   9956  0.020992
    14     0.14    201   9956  0.020189
    15     0.15    178   9956  0.017879
    16     0.16    202   9956  0.020289
    17     0.17    197   9956  0.019787
    18     0.18    208   9956  0.020892
    19     0.19    202   9956  0.020289
    20     0.20    191   9956  0.019184
    21     0.21    188   9956  0.018883
    22     0.22    213   9956  0.021394
    23     0.23    192   9956  0.019285
    24     0.24    199   9956  0.019988
    25     0.25    181   9956  0.018180
    26     0.26    172   9956  0.017276
    27     0.27    191   9956  0.019184
    28     0.28    190   9956  0.019084
    29     0.29    192   9956  0.019285
    30     0.30    201   9956  0.020189
    31     0.31    212   9956  0.021294
    32     0.32    213   9956  0.021394
    33     0.33    177   9956  0.017778
    34     0.34    197   9956  0.019787
    35     0.35    163   9956  0.016372
    36     0.36    191   9956  0.019184
    37     0.37    198   9956  0.019888
    38     0.38    160   9956  0.016071
    39     0.39    188   9956  0.018883
    40     0.40    200   9956  0.020088
    41     0.41    188   9956  0.018883
    42     0.42    230   9956  0.023102
    43     0.43    197   9956  0.019787
    44     0.44    224   9956  0.022499
    45     0.45    184   9956  0.018481
    46     0.46    198   9956  0.019888
    47     0.47    187   9956  0.018783
    48     0.48    200   9956  0.020088
    49     0.49    194   9956  0.019486
    """
    gr = self.pr

    kwargs = pr.pyranges.fill_kwargs({"sparse": {"self": True, "other": True}})

    per_key = pyrange_apply(_relative_distance, gr, other, **kwargs)  # pylint: disable=E1132

    distances = pd.Series(np.concatenate(list(per_key.values())))

    # Floor to two decimals; NaN entries are left untouched.
    valid = ~np.isnan(distances)
    distances.loc[valid] = np.floor(distances[valid] * 100) / 100

    n_total = len(distances)

    table = distances.value_counts(dropna=False).to_frame().reset_index()
    table.columns = ["reldist", "count"]
    table.insert(table.shape[1], "total", n_total)
    table.insert(table.shape[1], "fraction", table["count"] / n_total)

    return table.sort_values("reldist", ascending=True).reset_index(drop=True)
def introns(self, by="gene", nb_cpu=1):
    """Return the introns.

    Parameters
    ----------
    by : str, {"gene", "transcript"}, default "gene"
        Whether to find introns per gene or transcript.

    nb_cpu: int, default 1
        How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple.
        Will only lead to speedups on large datasets.

    See Also
    --------
    pyranges.genomicfeatures.GenomicFeaturesMethods.tss : return the transcription start sites

    Examples
    --------
    >>> gr = pr.data.ensembl_gtf()
    >>> gr
    +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------+
    | Chromosome   | Source     | Feature      | Start     | End       | Score      | Strand       | Frame      | gene_biotype                       | +19   |
    | (category)   | (object)   | (category)   | (int32)   | (int32)   | (object)   | (category)   | (object)   | (object)                           | ...   |
    |--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------|
    | 1            | havana     | gene         | 11868     | 14409     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
    | 1            | havana     | transcript   | 11868     | 14409     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
    | 1            | havana     | exon         | 11868     | 12227     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
    | 1            | havana     | exon         | 12612     | 12721     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
    | ...          | ...        | ...          | ...       | ...       | ...        | ...          | ...        | ...                                | ...   |
    | 1            | havana     | gene         | 1173055   | 1179555   | .          | -            | .          | lncRNA                             | ...   |
    | 1            | havana     | transcript   | 1173055   | 1179555   | .          | -            | .          | lncRNA                             | ...   |
    | 1            | havana     | exon         | 1179364   | 1179555   | .          | -            | .          | lncRNA                             | ...   |
    | 1            | havana     | exon         | 1173055   | 1176396   | .          | -            | .          | lncRNA                             | ...   |
    +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------+
    Stranded PyRanges object has 2,446 rows and 28 columns from 1 chromosomes.
    For printing, the PyRanges was sorted on Chromosome and Strand.
    19 hidden columns: gene_id, gene_name, gene_source, gene_version, tag, transcript_biotype, transcript_id, transcript_name, transcript_source, transcript_support_level, ... (+ 9 more.)

    >>> gr.features.introns(by="gene")
    +--------------+----------------+------------+-----------+-----------+------------+--------------+------------+-------+
    | Chromosome   | Source         | Feature    | Start     | End       | Score      | Strand       | Frame      | +20   |
    | (object)     | (object)       | (object)   | (int32)   | (int32)   | (object)   | (category)   | (object)   | ...   |
    |--------------+----------------+------------+-----------+-----------+------------+--------------+------------+-------|
    | 1            | ensembl_havana | intron     | 1173926   | 1174265   | .          | +            | .          | ...   |
    | 1            | ensembl_havana | intron     | 1174321   | 1174423   | .          | +            | .          | ...   |
    | 1            | ensembl_havana | intron     | 1174489   | 1174520   | .          | +            | .          | ...   |
    | 1            | ensembl_havana | intron     | 1175034   | 1179188   | .          | +            | .          | ...   |
    | ...          | ...            | ...        | ...       | ...       | ...        | ...          | ...        | ...   |
    | 1            | havana         | intron     | 874591    | 875046    | .          | -            | .          | ...   |
    | 1            | havana         | intron     | 875155    | 875525    | .          | -            | .          | ...   |
    | 1            | havana         | intron     | 875625    | 876526    | .          | -            | .          | ...   |
    | 1            | havana         | intron     | 876611    | 876754    | .          | -            | .          | ...   |
    +--------------+----------------+------------+-----------+-----------+------------+--------------+------------+-------+
    Stranded PyRanges object has 311 rows and 28 columns from 1 chromosomes.
    For printing, the PyRanges was sorted on Chromosome and Strand.
    20 hidden columns: gene_biotype, gene_id, gene_name, gene_source, gene_version, tag, transcript_biotype, transcript_id, transcript_name, ... (+ 11 more.)

    >>> gr.features.introns(by="transcript")
    +--------------+----------------+------------+-----------+-----------+------------+--------------+------------+----------------------------------+-------+
    | Chromosome   | Source         | Feature    | Start     | End       | Score      | Strand       | Frame      | gene_biotype                     | +19   |
    | (object)     | (object)       | (object)   | (int32)   | (int32)   | (object)   | (category)   | (object)   | (object)                         | ...   |
    |--------------+----------------+------------+-----------+-----------+------------+--------------+------------+----------------------------------+-------|
    | 1            | havana         | intron     | 818202    | 818722    | .          | +            | .          | lncRNA                           | ...   |
    | 1            | ensembl_havana | intron     | 960800    | 961292    | .          | +            | .          | protein_coding                   | ...   |
    | 1            | ensembl_havana | intron     | 961552    | 961628    | .          | +            | .          | protein_coding                   | ...   |
    | 1            | ensembl_havana | intron     | 961750    | 961825    | .          | +            | .          | protein_coding                   | ...   |
    | ...          | ...            | ...        | ...       | ...       | ...        | ...          | ...        | ...                              | ...   |
    | 1            | havana         | intron     | 732207    | 732980    | .          | -            | .          | transcribed_processed_pseudogene | ...   |
    | 1            | havana_tagene  | intron     | 168165    | 169048    | .          | -            | .          | lncRNA                           | ...   |
    | 1            | havana_tagene  | intron     | 165942    | 167958    | .          | -            | .          | lncRNA                           | ...   |
    | 1            | havana_tagene  | intron     | 168165    | 169048    | .          | -            | .          | lncRNA                           | ...   |
    +--------------+----------------+------------+-----------+-----------+------------+--------------+------------+----------------------------------+-------+
    Stranded PyRanges object has 1,043 rows and 28 columns from 1 chromosomes.
    For printing, the PyRanges was sorted on Chromosome and Strand.
    19 hidden columns: gene_id, gene_name, gene_source, gene_version, tag, transcript_biotype, transcript_id, transcript_name, transcript_source, transcript_support_level, ... (+ 9 more.)
    """
    kwargs = pr.pyranges.fill_kwargs({"by": by, "nb_cpu": nb_cpu})

    assert by in ["gene", "transcript"]
    id_column = by_to_id[by]

    sorted_gr = self.pr.sort(id_column)
    if not len(sorted_gr):
        return pr.PyRanges()

    def is_exon(df):
        return df.Feature == "exon"

    def is_by_feature(df):
        return df.Feature == by

    # Exons are collapsed per id column before being compared with the
    # gene/transcript rows.
    merged_exons = sorted_gr.subset(is_exon).merge(by=id_column)
    parents = sorted_gr.subset(is_by_feature)

    return pr.PyRanges(pyrange_apply(_introns2, parents, merged_exons, **kwargs))
def k_nearest(self, other, k=1, **kwargs):
    """Find the k nearest intervals in other for each interval in self.

    Parameters
    ----------
    other : PyRanges
        Intervals to search among.
    k : int or array-like, default 1
        Number of neighbours to keep. If the object has a ``values``
        attribute (e.g. a pandas Series) that attribute is used, so a
        per-interval k can be given.
    **kwargs
        Options; passed through ``fill_kwargs``. Entries read here:

        overlap : default True
            If truthy, overlapping intervals (found with ``join``) are
            added with ``Distance`` 0 before the k best are selected.
        ties : {False, "first", "last", "different"}, default False
            How rows at equal distance are handled. "first"/"last" keep
            the first k rows per interval after sorting on distance;
            "different" selects rows with ``get_different_ties``; a falsy
            value selects rows with ``get_all_ties``.
        suffix
            Suffix of other's columns in the result; used to locate the
            ``End`` column of the match when signing the distance.

    Returns
    -------
    PyRanges
        The selected matches with a signed ``Distance`` column; an empty
        PyRanges when nothing was found.
    """
    from pyranges.methods.k_nearest import _nearest
    from sorted_nearest import get_all_ties, get_different_ties

    kwargs = fill_kwargs(kwargs)
    kwargs["stranded"] = self.stranded and other.stranded

    overlap = kwargs.get("overlap", True)
    ties = kwargs.get("ties", False)

    # Work on a copy: the helper columns below must not reach the caller.
    self = pr.PyRanges({key: df.copy() for key, df in self.dfs.items()})

    # Array-like k (anything exposing ``.values``) is reduced to its raw
    # values; a plain number is used as is.
    k = getattr(k, "values", k)

    self.__k__ = k
    self.__IX__ = np.arange(len(self))

    nearest = PyRanges(pyrange_apply(_nearest, self, other, **kwargs))

    if not overlap:
        result = nearest
    else:
        overlap_kwargs = dict(kwargs)
        # Only "first"/"last" are forwarded as ``how``; anything else maps
        # to None.
        overlap_kwargs["how"] = {"first": "first", "last": "last"}.get(kwargs.get("ties"))

        overlaps = self.join(other, **overlap_kwargs)
        overlaps.Distance = 0

        result = pr.concat([overlaps, nearest])

    if not len(result):
        return pr.PyRanges()

    new_result = {}
    if ties in ["first", "last"]:
        for key, df in result:
            df = df.sort_values(["__IX__", "Distance"])
            kept = []
            # Rows are grouped on their requested k, then the first k rows
            # of every query interval are kept.
            for k_value, kdf in df.groupby("__k__", sort=False):
                kept.append(kdf.groupby("__IX__", sort=False).head(k_value))
            if kept:
                new_result[key] = pd.concat(kept)

    elif ties == "different" or not ties:
        select_rows = get_different_ties if ties else get_all_ties
        for key, df in result:
            if df.empty:
                continue
            df = df.sort_values(["__IX__", "Distance"])
            kept = []
            for k_value, kdf in df.groupby("__k__", sort=False):
                lx = select_rows(kdf.index.values, kdf.__IX__.values, kdf.Distance.astype(np.int64).values, k_value)
                kept.append(kdf.reindex(lx))
            if kept:
                new_result[key] = pd.concat(kept)

    result = pr.PyRanges(new_result)

    # ``Series.is_monotonic`` no longer exists in pandas >= 2.0;
    # ``is_monotonic_increasing`` is the same check.
    if not result.__IX__.is_monotonic_increasing:
        result = result.sort("__IX__")

    result = result.drop(like="__IX__|__k__")

    def prev_to_neg(df, kwargs):
        # Negate the distance of matches that end before the query starts;
        # on a strand other than "+" the complementary rows are negated.
        strand = df.Strand.iloc[0] if "Strand" in df else "+"

        suffix = kwargs["suffix"]

        bools = df["End" + suffix] < df.Start
        if strand != "+":
            bools = ~bools

        df.loc[bools, "Distance"] = -df.loc[bools, "Distance"]
        return df

    result = result.apply(prev_to_neg, suffix=kwargs["suffix"])

    return result