Ejemplo n.º 1
0
def assign_regions(features, supports):
    """
    Assign to each feature the genomic region (support) that overlaps it.

    If a feature overlaps several supports, the support with the largest
    overlap is reported. Features that overlap no support get a missing value.

    Parameters
    ----------
    features : pandas.DataFrame
        Either single intervals (columns "chrom", "start", "end") or paired
        intervals (columns "chrom1", "start1", "end1", "chrom2", "start2",
        "end2").
    supports : pandas.DataFrame
        Regions with columns "chrom", "start", "end". The region label is read
        from the "name_2" column of the bioframe overlap table, i.e.
        presumably a "name" column of `supports` -- verify against caller.

    Returns
    -------
    pandas.DataFrame
        A copy of `features` with an added "region" column and the original
        index restored. For paired intervals "region" is set only when both
        sides were assigned the same region, and is NaN otherwise.
    """

    # Temporary column that uniquely tags each feature row, so that overlap
    # rows can be de-duplicated per feature and mapped back unambiguously
    # (the original index may be non-unique).
    row_col = "_assign_regions_row"
    row_key = f"{row_col}_1"  # bioframe suffixes first-frame columns with "_1"

    def _largest_overlap_names(frame, coord_cols):
        """Return, for each row of `frame`, the name of the best support."""
        overlap = bioframe.overlap(
            frame,
            supports,
            how="left",
            cols1=coord_cols,
            cols2=["chrom", "start", "end"],
            keep_order=True,
            return_overlap=True,
        )
        _, start_col, end_col = coord_cols
        overlap["overlap_length"] = (
            overlap[f"overlap_{end_col}"] - overlap[f"overlap_{start_col}"]
        )
        # Keep exactly one row per feature: the one with the largest overlap.
        # A stable sort makes ties resolve deterministically to the first
        # support in the order returned by bioframe.
        best = overlap.sort_values(
            "overlap_length", ascending=False, kind="mergesort"
        ).drop_duplicates(row_key, keep="first")
        # Map names back by row id rather than by position, so the result
        # does not depend on the row order of the overlap table.
        return frame[row_col].map(best.set_index(row_key)["name_2"])

    index_name = features.index.name  # Store the name of index
    # Keep the original index as a regular column while working positionally.
    features = features.copy().reset_index()
    index_col = "index" if index_name is None else index_name
    features[row_col] = np.arange(len(features))

    if "chrom" in features.columns:
        features["region"] = _largest_overlap_names(
            features, ["chrom", "start", "end"]
        )

    if "chrom1" in features.columns:
        for idx in ("1", "2"):
            features[f"region{idx}"] = _largest_overlap_names(
                features, [f"chrom{idx}", f"start{idx}", f"end{idx}"]
            )

        # Single column with the region name where region1 == region2,
        # and np.nan in all other cases:
        features["region"] = np.where(
            features["region1"] == features["region2"], features["region1"], np.nan
        )
        features = features.drop(["region1", "region2"], axis=1)

    # Remove the helper column and restore the original index and its name.
    features = features.drop(row_col, axis=1).set_index(index_col)
    features.index.name = index_name
    return features
Ejemplo n.º 2
0
def _assign_supports(features, supports):
    """Assign a support (genomic region) to each entry of the snipping windows.

    Workaround for a bug in cooltools 0.2.0 where features overlapping several
    supports are not handled correctly. Adapted from
    cooltools.common.assign_regions.

    For every feature the support with the largest overlap is chosen; features
    overlapping no support get a missing value.

    Parameters
    ----------
    features : pandas.DataFrame
        Single intervals ("chrom", "start", "end") or paired intervals
        ("chrom1", "start1", "end1", "chrom2", "start2", "end2").
    supports : pandas.DataFrame
        Regions with columns "chrom", "start", "end". The label is read from
        the "name_2" column of the bioframe overlap table, i.e. presumably a
        "name" column of `supports` -- verify against caller.

    Returns
    -------
    pandas.DataFrame
        A copy of `features` with an added "region" column and the original
        index (and index name) restored. For paired intervals "region" is set
        only when both sides map to the same support, otherwise NaN.
    """
    index_name = features.index.name  # Store the name of index
    # Keep the original index as a regular column while working positionally.
    features = features.copy().reset_index()
    # reset_index names the column after the index, or "index" if unnamed.
    index_col = "index" if index_name is None else index_name

    # Unique per-row tag used as the de-duplication key. Keying on the index
    # column would fail for named indexes (the overlap column is then
    # "<name>_1", not "index_1") and would merge rows of a non-unique index.
    row_col = "_assign_supports_row"
    row_key = f"{row_col}_1"  # bioframe suffixes first-frame columns with "_1"
    features[row_col] = np.arange(len(features))

    def _pick_support(coord_cols):
        """Return the best-overlapping support name for each feature row."""
        overlap = bioframe.overlap(
            features,
            supports,
            how="left",
            cols1=coord_cols,
            cols2=["chrom", "start", "end"],
            keep_order=True,
            return_overlap=True,
        )
        start_col, end_col = coord_cols[1], coord_cols[2]
        overlap["overlap_length"] = (
            overlap[f"overlap_{end_col}"] - overlap[f"overlap_{start_col}"]
        )
        # One row per feature, largest overlap first; the stable sort keeps
        # ties in the order returned by bioframe.
        best = overlap.sort_values(
            "overlap_length", ascending=False, kind="mergesort"
        ).drop_duplicates(row_key, keep="first")
        # Look names up by row id so the result is independent of row order.
        return features[row_col].map(best.set_index(row_key)["name_2"])

    if "chrom" in features.columns:
        features["region"] = _pick_support(["chrom", "start", "end"])

    if "chrom1" in features.columns:
        for idx in ("1", "2"):
            features[f"region{idx}"] = _pick_support(
                [f"chrom{idx}", f"start{idx}", f"end{idx}"]
            )

        # Single column with the region name where region1 == region2,
        # and np.nan in all other cases:
        features["region"] = np.where(
            features["region1"] == features["region2"], features["region1"],
            np.nan)
        features = features.drop(["region1", "region2"], axis=1)

    # Remove the helper column and restore the original index and its name.
    features = features.drop(row_col, axis=1).set_index(index_col)
    features.index.name = index_name
    return features