Example #1
0
    def count_overlaps(self, other, **kwargs):
        """Run ``_number_overlapping`` over ``self`` and ``other``.

        Parameters
        ----------
        other : object
            Second operand handed to ``pyrange_apply`` (presumably a
            PyRanges -- confirm against callers).
        **kwargs
            Options; completed by ``fill_kwargs`` and forwarded unchanged.

        Returns
        -------
        pr.PyRanges
            Built from whatever ``pyrange_apply`` returns.
        """
        options = fill_kwargs(kwargs)

        # Imported at call time rather than at module import time.
        from pyranges.methods.coverage import _number_overlapping

        return pr.PyRanges(
            pyrange_apply(_number_overlapping, self, other, **options))
Example #2
0
    def overlap(self, other, **kwargs):
        """Run ``_overlap`` over ``self`` and ``other``.

        The ``sparse`` option is always forced to
        ``{"self": False, "other": True}`` before the remaining defaults are
        filled in, so a caller-supplied ``sparse`` value is overwritten.

        Parameters
        ----------
        other : object
            Second operand handed to ``pyrange_apply``.
        **kwargs
            Options; completed by ``fill_kwargs`` and forwarded.

        Returns
        -------
        PyRanges
            Built from the result of ``pyrange_apply``.
        """
        kwargs.update(sparse={"self": False, "other": True})
        options = fill_kwargs(kwargs)

        return PyRanges(pyrange_apply(_overlap, self, other, **options))
Example #3
0
    def intersect(self, other, **kwargs):
        """Run ``_intersection`` over ``self`` and ``other``.

        Defaults are filled in first; ``sparse`` is then set to
        ``{"self": False, "other": True}`` on the filled options, replacing
        any value present at that point.

        Parameters
        ----------
        other : object
            Second operand handed to ``pyrange_apply``.
        **kwargs
            Options; completed by ``fill_kwargs`` and forwarded.

        Returns
        -------
        PyRanges
            Built from the result of ``pyrange_apply``.
        """
        options = fill_kwargs(kwargs)
        options["sparse"] = {"self": False, "other": True}

        return PyRanges(pyrange_apply(_intersection, self, other, **options))
Example #4
0
    def join(self, other, **kwargs):
        """Run ``_write_both`` over ``self`` and ``other``.

        Parameters
        ----------
        other : object
            Second operand handed to ``pyrange_apply``.
        **kwargs
            Options; completed by ``fill_kwargs`` and forwarded.

        Returns
        -------
        PyRanges
            Built from the result of ``pyrange_apply``.
        """
        # Imported at call time rather than at module import time.
        from pyranges.methods.join import _write_both

        options = fill_kwargs(kwargs)

        return PyRanges(pyrange_apply(_write_both, self, other, **options))
Example #5
0
    def insert(self, other, col, **kwargs):
        """Run ``_insert`` over ``self`` and ``other``.

        Parameters
        ----------
        other : object
            Second operand handed to ``pyrange_apply``.
        col : object
            Stored under the ``"columns"`` option before defaults are filled
            in; a caller-supplied ``columns`` keyword is overwritten.
        **kwargs
            Further options; completed by ``fill_kwargs`` and forwarded.

        Returns
        -------
        PyRanges
            Built from the result of ``pyrange_apply``.
        """
        # Imported at call time rather than at module import time.
        from pyranges.methods.insert import _insert

        kwargs.update(columns=col)
        options = fill_kwargs(kwargs)

        return PyRanges(pyrange_apply(_insert, self, other, **options))
Example #6
0
    def nearest(self, other, **kwargs):
        """Run ``_nearest`` over ``self`` and ``other``.

        Parameters
        ----------
        other : object
            Second operand handed to ``pyrange_apply``.
        **kwargs
            Options; completed by ``fill_kwargs`` and forwarded.

        Returns
        -------
        PyRanges
            Built from the result of ``pyrange_apply``.
        """
        # Imported at call time rather than at module import time.
        from pyranges.methods.nearest import _nearest

        options = fill_kwargs(kwargs)

        return PyRanges(pyrange_apply(_nearest, self, other, **options))
Example #7
0
    def apply_pair(self, other, f, kwargs, strand=False, as_pyranges=True):
        """Apply a user function to ``self`` and ``other`` via ``pyrange_apply``.

        Parameters
        ----------
        other : object
            Second operand handed to ``pyrange_apply``.
        f : callable
            Wrapped with ``ray.remote`` before being applied.
        kwargs : dict
            Unpacked as keyword arguments for ``pyrange_apply``. No defaults
            are filled in here.
        strand : bool, default False
            Accepted but not read by this method.
        as_pyranges : bool, default True
            If truthy the result is wrapped in ``PyRanges``; otherwise the
            raw return value of ``pyrange_apply`` is handed back.

        Returns
        -------
        PyRanges or the raw ``pyrange_apply`` result
        """
        remote_f = ray.remote(f)

        outcome = pyrange_apply(remote_f, self, other, **kwargs)

        return PyRanges(outcome) if as_pyranges else outcome
Example #8
0
    def nearest(self, other, **kwargs):
        """Run ``_nearest`` over ``self`` and ``other``.

        Parameters
        ----------
        other : object
            Second operand handed to ``pyrange_apply``. Its ``stranded``
            attribute must be truthy when ``how`` is directional.
        **kwargs
            Options; completed by ``fill_kwargs`` and forwarded. ``how`` is
            inspected here: ``"upstream"`` and ``"downstream"`` require a
            stranded ``other``.

        Returns
        -------
        PyRanges
            Built from the result of ``pyrange_apply``.

        Raises
        ------
        AssertionError
            If ``how`` is ``"upstream"`` or ``"downstream"`` and
            ``other.stranded`` is falsy.
        """
        # Imported at call time rather than at module import time.
        from pyranges.methods.nearest import _nearest

        options = fill_kwargs(kwargs)

        directional = options.get("how") in ("upstream", "downstream")
        if directional:
            assert other.stranded, "If doing upstream or downstream nearest, other pyranges must be stranded"

        return PyRanges(pyrange_apply(_nearest, self, other, **options))
Example #9
0
    def set_intersect(self, other, **kwargs):
        """Merge both operands, then run ``_intersection`` on the merged pair.

        ``self`` and ``other`` are each passed through ``merge`` (with
        ``strand`` derived from the truthiness of the ``strandedness``
        option) before ``_intersection`` is applied.

        Parameters
        ----------
        other : object
            Second operand; must provide ``merge``.
        **kwargs
            Options; completed by ``fill_kwargs`` and forwarded both to the
            ``merge`` calls and to ``pyrange_apply``.

        Returns
        -------
        PyRanges
            Built from the result of ``pyrange_apply``.
        """
        options = fill_kwargs(kwargs)

        # Any truthy strandedness value means the merges are strand-aware.
        use_strand = bool(options["strandedness"])

        merged_self = self.merge(strand=use_strand, **options)
        merged_other = other.merge(strand=use_strand, **options)

        return PyRanges(
            pyrange_apply(_intersection, merged_self, merged_other, **options))
Example #10
0
    def overlap(self, other, **kwargs):
        """Run ``_overlap`` over ``self`` and ``other``.

        Two options are forced before the remaining defaults are filled in,
        overwriting anything the caller supplied for them:
        ``sparse={"self": False, "other": True}`` and ``how="first"``.

        Parameters
        ----------
        other : object
            Second operand handed to ``pyrange_apply``.
        **kwargs
            Options; completed by ``fill_kwargs`` and forwarded.

        Returns
        -------
        pr.PyRanges
            Built from the result of ``pyrange_apply``.
        """
        kwargs.update(sparse={"self": False, "other": True}, how="first")
        options = fill_kwargs(kwargs)

        return pr.PyRanges(pyrange_apply(_overlap, self, other, **options))
Example #11
0
    def join(self, other, **kwargs):
        """Run ``_write_both`` over ``self`` and ``other`` with optional extras.

        Parameters
        ----------
        other : object
            Second operand handed to ``pyrange_apply``.
        **kwargs
            Options; completed by ``fill_kwargs`` and forwarded. Keys read
            by this method:

            slack
                If truthy, ``self`` is replaced by ``self.slack(slack)`` for
                the join; the original Start/End are carried along in
                ``Start__slack``/``End__slack`` and restored afterwards.
            suffix
                If present, ``suffixes`` is set to ``("", suffix)``.
            new_pos
                If ``"intersection"`` or ``"union"``, both suffixes must be
                non-empty. If truthy, the joined result is passed through
                ``new_position``.
            how
                ``"left"``/``"outer"`` adds ``example_header_other``;
                ``"right"``/``"outer"`` adds ``example_header_self``.

        Returns
        -------
        PyRanges

        Raises
        ------
        AssertionError
            If ``new_pos`` is ``"intersection"`` or ``"union"`` and
            ``suffixes`` is missing or has an empty element.
        """
        # Imported at call time rather than at module import time.
        from pyranges.methods.join import _write_both

        slack = kwargs.get("slack")
        if slack:
            # NOTE(review): these two assignments happen on the object the
            # caller passed in, before ``self`` is rebound below -- confirm
            # whether the helper columns leak onto the caller's object.
            self.Start__slack = self.Start
            self.End__slack = self.End

            self = self.slack(slack)

        if "suffix" in kwargs:
            kwargs["suffixes"] = ("", kwargs["suffix"])

        kwargs = fill_kwargs(kwargs)

        if kwargs.get("new_pos") in ("intersection", "union"):
            suffixes = kwargs.get("suffixes")
            assert suffixes is not None, "Must give two non-empty suffixes when using new_pos with intersection or union."
            assert suffixes[0], "Must have nonempty first suffix when using new_pos with intersection or union."
            assert suffixes[1], "Must have nonempty second suffix when using new_pos with intersection or union."

        # One-row frames of the opposite side are passed along for the join
        # types that name that side.
        how = kwargs.get("how")
        if how in ("left", "outer"):
            kwargs["example_header_other"] = other.head(1).df
        if how in ("right", "outer"):
            kwargs["example_header_self"] = self.head(1).df

        gr = PyRanges(pyrange_apply(_write_both, self, other, **kwargs))

        if slack:
            # Put the pre-slack coordinates back and discard the helpers.
            gr.Start = gr.Start__slack
            gr.End = gr.End__slack
            gr = gr.drop(like="(Start|End).*__slack")

        new_position = kwargs.get("new_pos")
        if not new_position:
            return gr

        return gr.new_position(new_pos=new_position,
                               suffixes=kwargs["suffixes"])
Example #12
0
    def subtract(self, other, **kwargs):
        """Merge ``other``, then run ``_subtraction`` on ``self`` and the merge.

        ``sparse`` is forced to ``{"self": False, "other": True}`` before
        defaults are filled in. ``other`` is passed through ``merge`` (with
        ``strand`` derived from the truthiness of the ``strandedness``
        option) before ``_subtraction`` is applied.

        Parameters
        ----------
        other : object
            Second operand; must provide ``merge``.
        **kwargs
            Options; completed by ``fill_kwargs`` and forwarded both to
            ``merge`` and to ``pyrange_apply``.

        Returns
        -------
        PyRanges
            Built from the result of ``pyrange_apply``.
        """
        # Imported at call time rather than at module import time.
        from pyranges.methods.subtraction import _subtraction

        kwargs.update(sparse={"self": False, "other": True})
        options = fill_kwargs(kwargs)

        # Any truthy strandedness value means the merge is strand-aware.
        use_strand = bool(options["strandedness"])
        merged_other = other.merge(strand=use_strand, **options)

        return PyRanges(
            pyrange_apply(_subtraction, self, merged_other, **options))
Example #13
0
    def apply_pair(self, other, f, kwargs=None, strand=False,
                   as_pyranges=True):
        """Apply a user function to ``self`` and ``other`` via ``pyrange_apply``.

        Parameters
        ----------
        other : object
            Second operand handed to ``pyrange_apply``.
        f : callable
            Wrapped with ``ray.remote`` before being applied.
        kwargs : dict, optional
            Options; ``None`` is treated as an empty dict. Completed by
            ``fill_kwargs`` and forwarded as keyword arguments.
        strand : bool, default False
            Accepted but not read by this method.
        as_pyranges : bool, default True
            If truthy the result is wrapped in ``PyRanges``; otherwise the
            raw return value of ``pyrange_apply`` is handed back.

        Returns
        -------
        PyRanges or the raw ``pyrange_apply`` result
        """
        options = fill_kwargs({} if kwargs is None else kwargs)

        remote_f = ray.remote(f)

        outcome = pyrange_apply(remote_f, self, other, **options)

        return PyRanges(outcome) if as_pyranges else outcome
Example #14
0
    def apply_pair(self,
                   other,
                   f,
                   strandedness=False,
                   as_pyranges=True,
                   **kwargs):
        """Apply a user function to ``self`` and ``other`` via ``pyrange_apply``.

        Parameters
        ----------
        other : object
            Second operand handed to ``pyrange_apply``.
        f : callable
            Function handed to ``pyrange_apply`` as-is.
        strandedness : object, default False
            Stored under the ``"strandedness"`` option before defaults are
            filled in.
        as_pyranges : bool, default True
            If truthy the result is wrapped in ``PyRanges``; otherwise the
            raw return value of ``pyrange_apply`` is handed back.
        **kwargs
            Further options; completed by ``fill_kwargs`` and forwarded.

        Returns
        -------
        PyRanges or the raw ``pyrange_apply`` result
        """
        kwargs["strandedness"] = strandedness
        options = fill_kwargs(kwargs)

        outcome = pyrange_apply(f, self, other, **options)

        return PyRanges(outcome) if as_pyranges else outcome
Example #15
0
    def coverage(self, other, **kwargs):
        """Count overlaps with ``other``, then run ``_coverage`` on the counts.

        Steps, in order: ``self.count_overlaps(other,
        keep_nonoverlapping=True, ...)``; ``other.merge(count=True,
        strand=...)`` with ``strand`` derived from the truthiness of the
        ``strandedness`` option; ``_coverage`` applied to the counts and the
        merged ``other``.

        Parameters
        ----------
        other : object
            Second operand; must provide ``merge``.
        **kwargs
            Options; completed by ``fill_kwargs`` and forwarded to
            ``count_overlaps`` and ``pyrange_apply``.

        Returns
        -------
        pr.PyRanges
            Built from the result of ``pyrange_apply``.
        """
        options = fill_kwargs(kwargs)

        overlap_counts = self.count_overlaps(
            other, keep_nonoverlapping=True, **options)

        # Any truthy strandedness value means the merge is strand-aware.
        use_strand = bool(options["strandedness"])
        merged_other = other.merge(count=True, strand=use_strand)

        # Imported at call time rather than at module import time.
        from pyranges.methods.coverage import _coverage

        return pr.PyRanges(
            pyrange_apply(_coverage, overlap_counts, merged_other, **options))
Example #16
0
    def join(self, other, **kwargs):
        """Run ``_write_both`` over ``self`` and ``other`` with optional extras.

        Parameters
        ----------
        other : object
            Second operand handed to ``pyrange_apply``.
        **kwargs
            Options; completed by ``fill_kwargs`` and forwarded. Keys read
            by this method:

            slack
                If truthy, ``self`` is replaced by ``self.slack(slack)`` for
                the join; the original Start/End are carried along in
                ``Start__slack``/``End__slack`` and restored afterwards.
            suffix
                If present, ``suffixes`` is set to ``("", suffix)``.
            new_pos
                If ``"intersection"`` or ``"union"``, both suffixes must be
                non-empty. If truthy, the joined result is passed through
                ``new_position``.

        Returns
        -------
        PyRanges

        Raises
        ------
        AssertionError
            If ``new_pos`` is ``"intersection"`` or ``"union"`` and
            ``suffixes`` is missing or has an empty element.
        """
        # Imported at call time rather than at module import time.
        from pyranges.methods.join import _write_both

        slack = kwargs.get("slack")
        if slack:
            # NOTE(review): these two assignments happen on the object the
            # caller passed in, before ``self`` is rebound below -- confirm
            # whether the helper columns leak onto the caller's object.
            self.Start__slack = self.Start
            self.End__slack = self.End

            self = self.slack(slack)

        if "suffix" in kwargs:
            kwargs["suffixes"] = ("", kwargs["suffix"])

        kwargs = fill_kwargs(kwargs)

        if kwargs.get("new_pos") in ("intersection", "union"):
            suffixes = kwargs.get("suffixes")
            assert suffixes is not None, "Must give two non-empty suffixes when using new_pos with intersection or union."
            assert suffixes[0], "Must have nonempty first suffix when using new_pos with intersection or union."
            assert suffixes[1], "Must have nonempty second suffix when using new_pos with intersection or union."

        gr = PyRanges(pyrange_apply(_write_both, self, other, **kwargs))

        if slack:
            # Put the pre-slack coordinates back and discard the helpers.
            gr.Start = gr.Start__slack
            gr.End = gr.End__slack
            gr = gr.drop(like="(Start|End).*__slack")

        new_position = kwargs.get("new_pos")
        if not new_position:
            return gr

        return gr.new_position(new_pos=new_position,
                               suffixes=kwargs["suffixes"])
Example #17
0
    def relative_distance(self, other, **kwargs):
        """Tabulate the relative distances produced by ``_relative_distance``.

        The per-key arrays returned by ``pyrange_apply`` are concatenated,
        non-NaN values are floored to two decimals, and the frequency of
        each resulting value is counted.

        Parameters
        ----------
        other : object
            Second operand handed to ``pyrange_apply``.
        **kwargs
            Options; ``sparse`` is forced to
            ``{"self": True, "other": True}``, the rest are completed by
            ``pr.pyranges.fill_kwargs`` and forwarded.

        Returns
        -------
        pandas.DataFrame
            Columns ``reldist``, ``count``, ``total`` and ``fraction``,
            sorted ascending by ``reldist`` with a fresh integer index.
        """
        self = self.pr

        kwargs.update(sparse={"self": True, "other": True})
        options = pr.pyranges.fill_kwargs(kwargs)

        per_key = pyrange_apply(_relative_distance, self, other, **options)  # pylint: disable=E1132

        distances = pd.Series(np.concatenate(list(per_key.values())))

        # Floor to two decimals; NaN entries are left untouched.
        finite = ~np.isnan(distances)
        distances.loc[finite] = np.floor(distances[finite] * 100) / 100

        total = len(distances)
        table = distances.value_counts(dropna=False).to_frame().reset_index()
        table.columns = ["reldist", "count"]
        table.insert(table.shape[1], "total", total)
        table.insert(table.shape[1], "fraction", table["count"] / total)

        return table.sort_values("reldist", ascending=True).reset_index(drop=True)
Example #18
0
    def introns(self, by="gene"):
        """Run ``_introns2`` on the ``by`` features and their merged exons.

        Parameters
        ----------
        by : str, {"gene", "transcript"}, default "gene"
            Feature level to work on. It selects the id column (through
            ``by_to_id``) used for sorting and for merging exons, and the
            ``Feature`` value of the rows paired with those exons.

        Returns
        -------
        pr.PyRanges
            Empty if the sorted input has no rows; otherwise built from the
            result of ``pyrange_apply``.

        Raises
        ------
        AssertionError
            If ``by`` is neither ``"gene"`` nor ``"transcript"``.
        """
        options = pr.pyranges.fill_kwargs({"by": by})

        assert by in ["gene", "transcript"]

        id_column = by_to_id[by]
        sorted_gr = self.pr.sort(id_column)

        if len(sorted_gr) == 0:
            return pr.PyRanges()

        # Exons are merged per id so each id contributes merged exon blocks.
        merged_exons = sorted_gr.subset(
            lambda df: df.Feature == "exon").merge(by=id_column)

        parents = sorted_gr.subset(lambda df: df.Feature == by)

        return pr.PyRanges(
            pyrange_apply(_introns2, parents, merged_exons, **options))
Example #19
0
    def relative_distance(self, other):
        """Compute spatial correlation between two sets.

        Metric which describes relative distance between each interval in one
        set and two closest intervals in another.

        Parameters
        ----------
        other : PyRanges

            Intervals to compare with.

        Returns
        -------
        pandas.DataFrame

            DataFrame containing the frequency of each relative distance.
            Columns are ``reldist``, ``count``, ``total`` and ``fraction``.

        Notes
        -----
        This method takes no keyword options: the only option set before
        defaults are filled in is ``sparse``.

        See Also
        --------

        pyranges.statistics.jaccard : compute the jaccard coefficient
        pyranges.statistics.forbes : compute the forbes coefficient

        Examples
        --------

        >>> gr, gr2 = pr.data.chipseq(), pr.data.chipseq_background()
        >>> chromsizes = pr.data.chromsizes()
        >>> gr.stats.relative_distance(gr2)
            reldist  count  total  fraction
        0      0.00    264   9956  0.026517
        1      0.01    226   9956  0.022700
        2      0.02    206   9956  0.020691
        3      0.03    235   9956  0.023604
        4      0.04    194   9956  0.019486
        5      0.05    241   9956  0.024207
        6      0.06    201   9956  0.020189
        7      0.07    191   9956  0.019184
        8      0.08    192   9956  0.019285
        9      0.09    191   9956  0.019184
        10     0.10    186   9956  0.018682
        11     0.11    203   9956  0.020390
        12     0.12    218   9956  0.021896
        13     0.13    209   9956  0.020992
        14     0.14    201   9956  0.020189
        15     0.15    178   9956  0.017879
        16     0.16    202   9956  0.020289
        17     0.17    197   9956  0.019787
        18     0.18    208   9956  0.020892
        19     0.19    202   9956  0.020289
        20     0.20    191   9956  0.019184
        21     0.21    188   9956  0.018883
        22     0.22    213   9956  0.021394
        23     0.23    192   9956  0.019285
        24     0.24    199   9956  0.019988
        25     0.25    181   9956  0.018180
        26     0.26    172   9956  0.017276
        27     0.27    191   9956  0.019184
        28     0.28    190   9956  0.019084
        29     0.29    192   9956  0.019285
        30     0.30    201   9956  0.020189
        31     0.31    212   9956  0.021294
        32     0.32    213   9956  0.021394
        33     0.33    177   9956  0.017778
        34     0.34    197   9956  0.019787
        35     0.35    163   9956  0.016372
        36     0.36    191   9956  0.019184
        37     0.37    198   9956  0.019888
        38     0.38    160   9956  0.016071
        39     0.39    188   9956  0.018883
        40     0.40    200   9956  0.020088
        41     0.41    188   9956  0.018883
        42     0.42    230   9956  0.023102
        43     0.43    197   9956  0.019787
        44     0.44    224   9956  0.022499
        45     0.45    184   9956  0.018481
        46     0.46    198   9956  0.019888
        47     0.47    187   9956  0.018783
        48     0.48    200   9956  0.020088
        49     0.49    194   9956  0.019486
        """

        self = self.pr

        kwargs = pr.pyranges.fill_kwargs(
            {"sparse": {"self": True, "other": True}})

        result = pyrange_apply(_relative_distance, self, other, **kwargs)  # pylint: disable=E1132

        # One array per key; flatten them into a single Series.
        result = pd.Series(np.concatenate(list(result.values())))

        # Floor to two decimals; NaN entries are left untouched.
        not_nan = ~np.isnan(result)
        result.loc[not_nan] = np.floor(result[not_nan] * 100) / 100

        total = len(result)
        vc = result.value_counts(dropna=False).to_frame().reset_index()
        vc.columns = "reldist count".split()
        vc.insert(vc.shape[1], "total", total)
        vc.insert(vc.shape[1], "fraction", vc["count"] / total)
        vc = vc.sort_values("reldist", ascending=True)
        vc = vc.reset_index(drop=True)

        return vc
Example #20
0
    def introns(self, by="gene", nb_cpu=1):
        """Return the introns.

        Parameters
        ----------
        by : str, {"gene", "transcript"}, default "gene"
            Whether to find introns per gene or transcript.

        nb_cpu: int, default 1

            How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple.
            Will only lead to speedups on large datasets.

        Raises
        ------
        AssertionError
            If ``by`` is neither ``"gene"`` nor ``"transcript"``.

        See Also
        --------
        pyranges.genomicfeatures.GenomicFeaturesMethods.tss : return the transcription start sites

        Examples
        --------

        >>> gr = pr.data.ensembl_gtf()
        >>> gr
        +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------+
        | Chromosome   | Source     | Feature      | Start     | End       | Score      | Strand       | Frame      | gene_biotype                       | +19   |
        | (category)   | (object)   | (category)   | (int32)   | (int32)   | (object)   | (category)   | (object)   | (object)                           | ...   |
        |--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------|
        | 1            | havana     | gene         | 11868     | 14409     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
        | 1            | havana     | transcript   | 11868     | 14409     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
        | 1            | havana     | exon         | 11868     | 12227     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
        | 1            | havana     | exon         | 12612     | 12721     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
        | ...          | ...        | ...          | ...       | ...       | ...        | ...          | ...        | ...                                | ...   |
        | 1            | havana     | gene         | 1173055   | 1179555   | .          | -            | .          | lncRNA                             | ...   |
        | 1            | havana     | transcript   | 1173055   | 1179555   | .          | -            | .          | lncRNA                             | ...   |
        | 1            | havana     | exon         | 1179364   | 1179555   | .          | -            | .          | lncRNA                             | ...   |
        | 1            | havana     | exon         | 1173055   | 1176396   | .          | -            | .          | lncRNA                             | ...   |
        +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------+
        Stranded PyRanges object has 2,446 rows and 28 columns from 1 chromosomes.
        For printing, the PyRanges was sorted on Chromosome and Strand.
        19 hidden columns: gene_id, gene_name, gene_source, gene_version, tag, transcript_biotype, transcript_id, transcript_name, transcript_source, transcript_support_level, ... (+ 9 more.)

        >>> gr.features.introns(by="gene")
        +--------------+----------------+------------+-----------+-----------+------------+--------------+------------+-------+
        | Chromosome   | Source         | Feature    | Start     | End       | Score      | Strand       | Frame      | +20   |
        | (object)     | (object)       | (object)   | (int32)   | (int32)   | (object)   | (category)   | (object)   | ...   |
        |--------------+----------------+------------+-----------+-----------+------------+--------------+------------+-------|
        | 1            | ensembl_havana | intron     | 1173926   | 1174265   | .          | +            | .          | ...   |
        | 1            | ensembl_havana | intron     | 1174321   | 1174423   | .          | +            | .          | ...   |
        | 1            | ensembl_havana | intron     | 1174489   | 1174520   | .          | +            | .          | ...   |
        | 1            | ensembl_havana | intron     | 1175034   | 1179188   | .          | +            | .          | ...   |
        | ...          | ...            | ...        | ...       | ...       | ...        | ...          | ...        | ...   |
        | 1            | havana         | intron     | 874591    | 875046    | .          | -            | .          | ...   |
        | 1            | havana         | intron     | 875155    | 875525    | .          | -            | .          | ...   |
        | 1            | havana         | intron     | 875625    | 876526    | .          | -            | .          | ...   |
        | 1            | havana         | intron     | 876611    | 876754    | .          | -            | .          | ...   |
        +--------------+----------------+------------+-----------+-----------+------------+--------------+------------+-------+
        Stranded PyRanges object has 311 rows and 28 columns from 1 chromosomes.
        For printing, the PyRanges was sorted on Chromosome and Strand.
        20 hidden columns: gene_biotype, gene_id, gene_name, gene_source, gene_version, tag, transcript_biotype, transcript_id, transcript_name, ... (+ 11 more.)

        >>> gr.features.introns(by="transcript")
        +--------------+----------------+------------+-----------+-----------+------------+--------------+------------+----------------------------------+-------+
        | Chromosome   | Source         | Feature    | Start     | End       | Score      | Strand       | Frame      | gene_biotype                     | +19   |
        | (object)     | (object)       | (object)   | (int32)   | (int32)   | (object)   | (category)   | (object)   | (object)                         | ...   |
        |--------------+----------------+------------+-----------+-----------+------------+--------------+------------+----------------------------------+-------|
        | 1            | havana         | intron     | 818202    | 818722    | .          | +            | .          | lncRNA                           | ...   |
        | 1            | ensembl_havana | intron     | 960800    | 961292    | .          | +            | .          | protein_coding                   | ...   |
        | 1            | ensembl_havana | intron     | 961552    | 961628    | .          | +            | .          | protein_coding                   | ...   |
        | 1            | ensembl_havana | intron     | 961750    | 961825    | .          | +            | .          | protein_coding                   | ...   |
        | ...          | ...            | ...        | ...       | ...       | ...        | ...          | ...        | ...                              | ...   |
        | 1            | havana         | intron     | 732207    | 732980    | .          | -            | .          | transcribed_processed_pseudogene | ...   |
        | 1            | havana_tagene  | intron     | 168165    | 169048    | .          | -            | .          | lncRNA                           | ...   |
        | 1            | havana_tagene  | intron     | 165942    | 167958    | .          | -            | .          | lncRNA                           | ...   |
        | 1            | havana_tagene  | intron     | 168165    | 169048    | .          | -            | .          | lncRNA                           | ...   |
        +--------------+----------------+------------+-----------+-----------+------------+--------------+------------+----------------------------------+-------+
        Stranded PyRanges object has 1,043 rows and 28 columns from 1 chromosomes.
        For printing, the PyRanges was sorted on Chromosome and Strand.
        19 hidden columns: gene_id, gene_name, gene_source, gene_version, tag, transcript_biotype, transcript_id, transcript_name, transcript_source, transcript_support_level, ... (+ 9 more.)
        """

        options = pr.pyranges.fill_kwargs({"by": by, "nb_cpu": nb_cpu})

        assert by in ["gene", "transcript"]

        id_column = by_to_id[by]
        sorted_gr = self.pr.sort(id_column)

        if len(sorted_gr) == 0:
            return pr.PyRanges()

        # Exons are merged per id so each id contributes merged exon blocks.
        merged_exons = sorted_gr.subset(
            lambda df: df.Feature == "exon").merge(by=id_column)

        parents = sorted_gr.subset(lambda df: df.Feature == by)

        return pr.PyRanges(
            pyrange_apply(_introns2, parents, merged_exons, **options))
Example #21
0
    def k_nearest(self, other, k=1, **kwargs):
        """Find up to ``k`` nearest intervals in ``other`` for each interval.

        Parameters
        ----------
        other : object
            Second operand (presumably a PyRanges -- it must provide
            ``stranded`` and be accepted by ``join``).
        k : int or array-like, default 1
            Number of neighbours to keep per interval. If the object has a
            ``values`` attribute (e.g. a pandas Series) that attribute is
            used instead. It is stored per row in a temporary ``__k__``
            column.
        **kwargs
            Options; completed by ``fill_kwargs`` and forwarded. Keys read
            by this method:

            overlap : default True
                If truthy, rows from ``self.join(other)`` are added with
                ``Distance`` set to 0 before the selection step.
            ties : default False
                ``"first"`` / ``"last"`` keep the first ``k`` rows per
                interval after sorting by distance; ``"different"`` selects
                rows with ``get_different_ties``; a falsy value selects
                them with ``get_all_ties``. Any other value selects no rows.
            suffix
                Suffix of the columns coming from ``other``; used to locate
                its ``End`` column when signing distances.

        Returns
        -------
        pr.PyRanges
            Empty if nothing was found. Otherwise the selected rows, without
            the temporary ``__IX__``/``__k__`` columns, and with ``Distance``
            negated where ``End + suffix`` is smaller than ``Start`` (the
            mask is inverted on strands other than ``"+"``).
        """

        from pyranges.methods.k_nearest import _nearest
        from sorted_nearest import get_all_ties, get_different_ties

        kwargs = fill_kwargs(kwargs)
        kwargs["stranded"] = self.stranded and other.stranded

        overlap = kwargs.get("overlap", True)
        ties = kwargs.get("ties", False)

        # Work on copies so the helper columns added below are not written
        # into the frames of the object the caller passed in.
        self = pr.PyRanges({key: df.copy() for key, df in self.dfs.items()})

        # Unwrap array-likes (e.g. a pandas Series); plain numbers have no
        # ``values`` attribute and are kept as they are.
        k = getattr(k, "values", k)

        self.__k__ = k
        self.__IX__ = np.arange(len(self))

        dfs = pyrange_apply(_nearest, self, other, **kwargs)
        nearest = PyRanges(dfs)

        if not overlap:
            result = nearest
        else:
            overlap_kwargs = dict(kwargs)
            # Only "first"/"last" are passed on as join's ``how``; every
            # other ``ties`` value maps to None.
            overlap_kwargs["how"] = {"first": "first", "last": "last"}.get(
                kwargs.get("ties"))
            overlaps = self.join(other, **overlap_kwargs)
            overlaps.Distance = 0

            result = pr.concat([overlaps, nearest])

        if not len(result):
            return pr.PyRanges()

        new_result = {}
        if ties in ["first", "last"]:
            # NOTE(review): "first" and "last" go through the same
            # ``head`` selection here -- confirm that is intended.
            for c, df in result:
                df = df.sort_values(["__IX__", "Distance"])

                # Rows sharing the same requested k are handled together so
                # a single ``head(k)`` per original interval is enough.
                selected = [
                    kdf.groupby("__IX__", sort=False).head(group_k)
                    for group_k, kdf in df.groupby("__k__", sort=False)
                ]

                if selected:
                    new_result[c] = pd.concat(selected)
        elif ties == "different" or not ties:
            pick_ties = get_different_ties if ties else get_all_ties

            for c, df in result:
                if df.empty:
                    continue

                df = df.sort_values(["__IX__", "Distance"])

                selected = []
                for group_k, kdf in df.groupby("__k__", sort=False):
                    lx = pick_ties(kdf.index.values,
                                   kdf["__IX__"].values,
                                   kdf["Distance"].astype(np.int64).values,
                                   group_k)
                    selected.append(kdf.reindex(lx))

                if selected:
                    new_result[c] = pd.concat(selected)

        result = pr.PyRanges(new_result)

        # Restore the original interval order if the selection shuffled it.
        if not result.__IX__.is_monotonic_increasing:
            result = result.sort("__IX__")

        result = result.drop(like="__IX__|__k__")

        def prev_to_neg(df, kwargs):
            """Negate ``Distance`` for the rows picked out by the End/Start test."""

            strand = df.Strand.iloc[0] if "Strand" in df else "+"

            suffix = kwargs["suffix"]

            bools = df["End" + suffix] < df.Start
            if strand != "+":
                bools = ~bools

            df.loc[bools, "Distance"] = -df.loc[bools, "Distance"]
            return df

        result = result.apply(prev_to_neg, suffix=kwargs["suffix"])

        return result