Example #1
    def __init__(self, clr, expected, cooler_opts=None, view_df=None):
        self.clr = clr
        self.expected = expected

        # Detect the columns that describe regions in the expected table
        columns = expected.columns
        assert len(columns) > 0
        if ("region1" not in columns) or ("region2" not in columns):
            if ("chrom" in columns) or ("region" in columns):
                raise ValueError(
                    "The provided expected dataframe appears to be in the old format; "
                    "it must comply with the format of expected v1.0"
                )
            else:
                raise ValueError(
                    "Please check the expected dataframe; it must comply with the format of expected v1.0"
                )

        # get chromosomes from cooler, if view_df not specified:
        if view_df is None:
            view_df = bioframe.make_viewframe([
                (chrom, 0, l, chrom) for chrom, l in clr.chromsizes.items()
            ])
        else:
            # appropriate viewframe checks:
            if not bioframe.is_viewframe(view_df):
                raise ValueError("view_df is not a valid viewframe.")
            if not bioframe.is_contained(
                    view_df, bioframe.make_viewframe(clr.chromsizes)):
                raise ValueError(
                    "view_df is out of the bounds of chromosomes in cooler.")

        self.view_df = view_df.set_index("name")

        for (name1,
             name2), group in self.expected.groupby(["region1", "region2"]):
            if name1 != name2:
                raise ValueError(
                    "Only symmetric regions are supported, e.g. chromosomes, arms, etc."
                )
            n_diags = group.shape[0]
            region = self.view_df.loc[name1]
            lo, hi = self.clr.extent(region)
            if n_diags != (hi - lo):
                raise ValueError(
                    "Region shape mismatch between expected and cooler. "
                    "Are they using the same resolution?")

        self.binsize = self.clr.binsize
        self.offsets = {}
        self.pad = True
        self.cooler_opts = {} if cooler_opts is None else cooler_opts
        self.cooler_opts.setdefault("sparse", True)
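    # A minimal usage sketch, assuming this __init__ belongs to a snipper class
    # such as ObsExpSnipper (that is how the same signature is used in Example #6);
    # the file paths below are hypothetical placeholders:
    #
    #   clr = cooler.Cooler("test.10000.cool")
    #   expected = pd.read_table("expected.cis.tsv")  # expected v1.0 with region1/region2
    #   view_df = bioframe.make_viewframe(clr.chromsizes)
    #   snipper = ObsExpSnipper(clr, expected, view_df=view_df)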
Example #2
def _make_cooler_view(view_df, clr):
    try:
        if not bioframe.is_viewframe(view_df, raise_errors=True):
            raise ValueError("view_df is not a valid viewframe.")
    except Exception as e:  # AssertionError or ValueError, see https://github.com/gfudenberg/bioframe/blob/main/bioframe/core/checks.py#L177
        warnings.warn(
            "view_df has to be a proper viewframe from next release",
            DeprecationWarning,
            stacklevel=2,
        )
        view_df = bioframe.make_viewframe(view_df)
    if not bioframe.is_contained(view_df,
                                 bioframe.make_viewframe(clr.chromsizes)):
        raise ValueError(
            "View table is out of the bounds of chromosomes in cooler.")
    return view_df
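

# A minimal usage sketch: coerce a plain region table into a cooler-compatible
# viewframe via _make_cooler_view. The cooler path and the region below are
# hypothetical placeholders.
def _example_make_cooler_view_usage():
    import cooler
    import pandas as pd

    clr = cooler.Cooler("test.10000.cool")  # hypothetical cooler
    regions = pd.DataFrame(
        [("chr1", 0, 50_000_000)],  # hypothetical region
        columns=["chrom", "start", "end"],
    )
    # a bare BED3-like table is not a proper viewframe yet, so _make_cooler_view
    # warns and converts it with bioframe.make_viewframe:
    return _make_cooler_view(regions, clr)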
Example #3
def read_viewframe(
    fname,
    verify_cooler_view=None,
):
    """
    Read a BED file with regions that conform to
    the definition of a viewframe (non-overlapping, unique names, etc.).

    Parameters
    ----------
    fname : str
        Path to a BED file with regions.
    verify_cooler_view : None or viewframe
        Viewframe with entire chromosome sizes

    Returns
    -------
    view_df : pd.DataFrame
        DataFrame with the viewframe
    """
    # define chromsizes based on verify_cooler_view
    chromsizes = None if (verify_cooler_view is None) else \
        verify_cooler_view.set_index("chrom")["end"]
    # read BED file assuming bed4/3 formats (with names-columns and without):
    try:
        view_df = bioframe.read_table(fname, schema="bed4", index_col=False)
    except Exception:
        view_df = bioframe.read_table(fname, schema="bed3", index_col=False)
    # Convert view dataframe to viewframe:
    try:
        view_df = bioframe.make_viewframe(view_df) if (verify_cooler_view is None) else \
            bioframe.make_viewframe(view_df, check_bounds=chromsizes)
    except ValueError as e:
        raise ValueError(
            "View table is incorrect, please, comply with the format. ") from e

    # Check that input view is contained in cooler bounds, but not vice versa (because cooler may have more regions):
    if verify_cooler_view is not None:
        if not bioframe.is_contained(view_df, verify_cooler_view):
            raise ValueError(
                "View regions are not contained in cooler chromsizes bounds")

    return view_df
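

# A minimal usage sketch for read_viewframe, assuming a hypothetical BED file
# "arms.bed" and a hypothetical cooler to verify the regions against:
def _example_read_viewframe_usage():
    import bioframe
    import cooler

    clr = cooler.Cooler("test.10000.cool")  # hypothetical cooler
    cooler_view = bioframe.make_viewframe(clr.chromsizes)
    # read BED3/BED4 regions and check that they fit into the cooler chromsizes:
    return read_viewframe("arms.bed", verify_cooler_view=cooler_view)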
Example #4
    def __init__(self, clr, cooler_opts=None, view_df=None):

        # get chromosomes from cooler, if view_df not specified:
        if view_df is None:
            view_df = bioframe.make_viewframe([
                (chrom, 0, l, chrom) for chrom, l in clr.chromsizes.items()
            ])
        else:
            # appropriate viewframe checks:
            if not bioframe.is_viewframe(view_df):
                raise ValueError("view_df is not a valid viewframe.")
            if not bioframe.is_contained(
                    view_df, bioframe.make_viewframe(clr.chromsizes)):
                raise ValueError(
                    "view_df is out of the bounds of chromosomes in cooler.")

        self.view_df = view_df.set_index("name")

        self.clr = clr
        self.binsize = self.clr.binsize
        self.offsets = {}
        self.pad = True
        self.cooler_opts = {} if cooler_opts is None else cooler_opts
        self.cooler_opts.setdefault("sparse", True)
Example #5
def is_compatible_viewframe(view_df,
                            verify_cooler,
                            check_sorting=False,
                            raise_errors=False):
    """
    Check if view_df is a viewframe and if
    it is compatible with the provided cooler.

    Parameters
    ----------
    view_df :  DataFrame
        view_df DataFrame to be validated
    verify_cooler : cooler
        cooler object to use for verification
    check_sorting : bool
        Check if regions in view_df are sorted in the same order
        as chromosomes in the cooler.
    raise_errors : bool
        raise an exception instead of returning False

    Returns
    -------
    is_compatible_viewframe : bool
        True when view_df is compatible, False otherwise
    """
    try:
        try:
            _ = bioframe.is_viewframe(view_df, raise_errors=True)
        except Exception as error_not_viewframe:
            try:
                _ = bioframe.make_viewframe(view_df)
            except Exception as error_cannot_make_viewframe:
                # view_df is not a viewframe and cannot be easily converted
                raise ValueError(
                    "view_df is not a valid viewframe and cannot be recovered"
                ) from error_cannot_make_viewframe
            else:
                # view_df is not a viewframe, but can be converted - formatting issue? missing name column?
                raise ValueError(
                    "view_df is not a valid viewframe, apply bioframe.make_viewframe to convert"
                ) from error_not_viewframe

        # is view_df contained inside cooler-chromosomes ?
        cooler_view = make_cooler_view(verify_cooler)
        if not bioframe.is_contained(view_df, cooler_view, raise_errors=False):
            raise ValueError(
                "View table is out of the bounds of chromosomes in cooler.")

        # is view_df sorted by coord and chrom order as in cooler ?
        if check_sorting:
            if not bioframe.is_sorted(
                    view_df, cooler_view, df_view_col="chrom"):
                raise ValueError(
                    "regions in the view_df must be sorted by coordinate"
                    " and chromosomes order as as in the verify_cooler.")

    except Exception as e:
        if raise_errors:
            raise ValueError(
                "view_df is not compatible, or not a viewframe") from e
        else:
            # something went wrong: it's not a viewframe
            return False
    else:
        # no exceptions were raised: it's a compatible viewframe
        return True
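

# A minimal usage sketch, assuming a hypothetical cooler path and that
# make_cooler_view (called inside is_compatible_viewframe) is importable from
# the same module:
def _example_is_compatible_viewframe_usage():
    import bioframe
    import cooler

    clr = cooler.Cooler("test.10000.cool")  # hypothetical cooler
    view_df = bioframe.make_viewframe(clr.chromsizes)
    # boolean mode: returns True/False without raising
    ok = is_compatible_viewframe(view_df, clr, check_sorting=True)
    # strict mode: raises ValueError describing what is incompatible
    is_compatible_viewframe(view_df, clr, check_sorting=True, raise_errors=True)
    return ok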
Example #6
        feature_type = "bed"
    elif {"chrom1", "start1", "end1", "chrom2", "start2",
          "end1"}.issubset(features_df.columns):
        feature_type = "bedpe"
    else:
        raise ValueError("Unknown feature_df format")
    if flank is not None:
        features_df = expand_align_features(features_df,
                                            flank,
                                            clr.binsize,
                                            format=feature_type)

    if view_df is None:
        view_df = bioframe.make_viewframe(clr.chromsizes)
    else:
        if not bioframe.is_contained(view_df,
                                     bioframe.make_viewframe(clr.chromsizes)):
            raise ValueError(
                "view_df is out of the bounds of chromosomes in cooler.")

    features_df = assign_regions(features_df, view_df)

    # TODO Expected checks are now implemented in the snippers, maybe move them out to here
    # when there is a neat function?

    if expected_df is None:
        snipper = CoolerSnipper(clr, view_df=view_df)
    else:
        snipper = ObsExpSnipper(clr, expected_df, view_df=view_df)

    if nproc > 1:
        pool = multiprocessing.Pool(nproc)
Example #7
def cooler_cis_eig(
    clr,
    bins,
    view_df=None,
    n_eigs=3,
    phasing_track_col="GC",
    balance="weight",
    ignore_diags=None,
    bad_bins=None,
    clip_percentile=99.9,
    sort_metric=None,
    map=map,
):
    """
    Compute compartment eigenvectors for a given cooler `clr` in a number of
    symmetric intra-chromosomal regions defined in view_df (cis-regions), or for each
    chromosome.

    Note that the amplitude of compartment eigenvectors is weighted by their
    corresponding eigenvalue.

    Parameters
    ----------
    clr : cooler
        cooler object to fetch data from
    bins : DataFrame
        table of bins derived from clr with phasing track added
    view_df : iterable or DataFrame, optional
        if provided, eigenvectors are calculated for the regions of the view only,
        otherwise chromosome-wide eigenvectors are computed, for chromosomes
        specified in bins.
    n_eigs : int
        number of eigenvectors to compute
    phasing_track_col : str, optional
        name of the column in the `bins` table; if provided, eigenvectors are
        flipped to achieve a positive correlation with `bins[phasing_track_col]`.
    balance : str
        name of the column with balancing weights to be used.
    ignore_diags : int, optional
        the number of diagonals to ignore. Derived from cooler metadata
        if not specified.
    bad_bins : array-like
        a list of bins to ignore. Indexes of bins must be absolute,
        as in clr.bins()[:], as opposed to being offset by chromosome start.
        `bad_bins` will be combined with the bad bins masked by balancing.
    clip_percentile : float
        if >0 and <100, clip pixels with diagonal-normalized values
        higher than the specified percentile of matrix-wide values.
    sort_metric : str
        If provided, re-sort `eigenvecs` and `eigvals` in the order of
        decreasing correlation between phasing_track and eigenvector, using the
        specified measure of correlation. Possible values:
        'pearsonr' - sort by decreasing Pearson correlation.
        'var_explained' - sort by decreasing absolute amount of variation in
        `eigvecs` explained by `phasing_track` (i.e. R^2 * var(eigvec))
        'MAD_explained' - sort by decreasing absolute amount of Median Absolute
        Deviation from the median of `eigvecs` explained by `phasing_track`
        (i.e. COMED(eigvec, phasing_track) * MAD(eigvec)).
        'spearmanr' - sort by decreasing Spearman correlation.
        This option is designed to report the most "biologically" informative
        eigenvectors first, and prevent eigenvector swapping caused by
        translocations. In reality, however, sometimes it shows poor
        performance and may lead to reporting of non-informative eigenvectors.
        Off by default.
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    eigvals_table, eigvec_table -> DataFrames with eigenvalues for each region and
    a table of eigenvectors added to the `bins` table.

    .. note:: ALWAYS check your EVs by eye. The first one occasionally does
              not reflect the compartment structure, but instead describes
              chromosomal arms or translocation blowouts. Possible mitigations:
              employ `view_df` (e.g. arms) to avoid issues with chromosomal arms,
              use `bad_bins` to ignore small translocations.
    """

    # get chromosomes from cooler, if view_df not specified:
    if view_df is None:
        view_df = bioframe.make_viewframe([(chrom, 0, clr.chromsizes[chrom])
                                           for chrom in clr.chromnames])
    else:
        # appropriate viewframe checks:
        if not bioframe.is_viewframe(view_df):
            raise ValueError("view_df is not a valid viewframe.")
        if not bioframe.is_contained(view_df,
                                     bioframe.make_viewframe(clr.chromsizes)):
            raise ValueError(
                "view_df is out of the bounds of chromosomes in cooler.")

    # make sure phasing_track_col is in bins, if phasing is requested
    if phasing_track_col and (phasing_track_col not in bins):
        raise ValueError(f'No column "{phasing_track_col}" in the bin table')

    # ignore diags as in cooler unless specified
    ignore_diags = (clr._load_attrs("bins/weight").get("ignore_diags", 2)
                    if ignore_diags is None else ignore_diags)

    # prepare output table for eigen vectors
    eigvec_table = bins.copy()
    eigvec_columns = [f"E{i + 1}" for i in range(n_eigs)]
    for ev_col in eigvec_columns:
        eigvec_table[ev_col] = np.nan

    # prepare output table for eigenvalues
    eigvals_table = view_df.copy()
    eigval_columns = [f"eigval{i + 1}" for i in range(n_eigs)]
    for eval_col in eigval_columns:
        eigvals_table[eval_col] = np.nan

    def _each(region):
        """
        perform eigen decomposition for a given region
        assuming safety checks are done outside of this
        function.
        Parameters
        ----------
        region: tuple-like
            tuple of the form (chroms,start,end,*)
        Returns
        -------
        _region, eigvals, eigvecs -> ndarrays
            array of eigenvalues and an array eigenvectors
        """
        _region = region[:3]  # take only (chrom, start, end)
        A = clr.matrix(balance=balance).fetch(_region)

        # filter bad_bins relevant for the _region from A
        if bad_bins is not None:
            # filter bad_bins for the _region and turn relative:
            lo, hi = clr.extent(_region)
            bad_bins_region = bad_bins[(bad_bins >= lo) & (bad_bins < hi)]
            bad_bins_region -= lo
            if len(bad_bins_region) > 0:
                # apply bad bins to symmetric matrix A:
                A[:, bad_bins_region] = np.nan
                A[bad_bins_region, :] = np.nan

        # extract phasing track relevant for the _region
        phasing_track = (bioframe.select(bins,
                                         _region)[phasing_track_col].values
                         if phasing_track_col else None)

        eigvals, eigvecs = cis_eig(
            A,
            n_eigs=n_eigs,
            ignore_diags=ignore_diags,
            phasing_track=phasing_track,
            clip_percentile=clip_percentile,
            sort_metric=sort_metric,
        )

        return _region, eigvals, eigvecs

    # eigendecompose matrix per region (can be multiprocessed)
    # output assumes that the order of results matches regions
    results = map(_each, view_df.values)

    # go through eigendecomposition results and fill in
    # output table eigvec_table and eigvals_table
    for _region, _eigvals, _eigvecs in results:
        idx = bioframe.select(eigvec_table, _region).index
        eigvec_table.loc[idx, eigvec_columns] = _eigvecs.T
        idx = bioframe.select(eigvals_table, _region).index
        eigvals_table.loc[idx, eigval_columns] = _eigvals

    return eigvals_table, eigvec_table
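

# A minimal usage sketch for cooler_cis_eig, assuming a hypothetical balanced
# cooler and a bin table that already carries a "GC" phasing track (e.g. added
# beforehand with bioframe):
def _example_cooler_cis_eig_usage():
    import cooler

    clr = cooler.Cooler("test.100000.cool")  # hypothetical cooler
    bins = clr.bins()[:]  # assumed to already include a "GC" column
    eigvals_table, eigvec_table = cooler_cis_eig(
        clr,
        bins,
        n_eigs=3,
        phasing_track_col="GC",
    )
    return eigvals_table, eigvec_table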
Example #8
def compute_pileup(
    cool_path,
    features,
    view,
    expected,
    flank,
    features_format,
    weight_name,
    out,
    out_format,
    store_snips,
    nproc,
    ignore_diags,
    aggregate,
    force,
    verbose,
):
    """
    Perform retrieval of the snippets from .cool file.

    COOL_PATH : The paths to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    FEATURES_PATH : the path to a BED or BEDPE-like file that contains features for snipping windows.
    If BED, then the features are on-diagonal. If BEDPE, then the features
    can be off-diagonal (but not in trans or between different regions in the view).

    """

    clr = cooler.Cooler(cool_path)

    #### Read the features:
    buf, names = sniff_for_header(features)
    if features_format.lower() == "bedpe":
        default_cols = [0, 1, 2, 3, 4, 5]
        bedpe_cols = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]
        dtypes = {
            "chrom1": str,
            "start1": np.int64,
            "end1": np.int64,
            "chrom2": str,
            "start2": np.int64,
            "end2": np.int64,
        }
        if names is None:
            kwargs = dict(
                header=None,
                usecols=default_cols,
                dtype=dtypes,
                names=bedpe_cols,
            )
        else:
            kwargs = dict(header="infer", usecols=bedpe_cols)
    elif features_format.lower() == "bed":
        default_cols = [0, 1, 2]
        bed_cols = ["chrom", "start", "end"]
        dtypes = {"chrom": str, "start": np.int64, "end": np.int64}
        if names is None:
            kwargs = dict(
                header=None,
                usecols=default_cols,
                dtype=dtypes,
                names=bed_cols,
            )
        else:
            kwargs = dict(header="infer", usecols=bed_cols, dtype=dtypes)
    else:
        raise ValueError(
            "Automatic detection of features format is not implemented yet. "
            "Please provide BED or BEDPE as --features-format")

    # column selection and dtypes are carried in kwargs, set per features_format above:
    features_df = pd.read_table(buf,
                                comment="#",
                                verbose=verbose,
                                **kwargs)

    ###### Define view for cis compartment-calling
    # use input "view" BED file or all chromosomes mentioned in "track":
    if view is None:
        # Generate viewframe from clr.chromsizes:
        view_df = bioframe.make_viewframe([(chrom, 0, clr.chromsizes[chrom])
                                           for chrom in clr.chromnames])
        if not bioframe.is_contained(features_df, view_df):
            raise ValueError(
                "Features are not contained in chromosomes bounds")
    else:
        # Make viewframe out of table:
        # Read view_df:
        try:
            view_df = bioframe.read_table(view, schema="bed4", index_col=False)
        except Exception:
            view_df = bioframe.read_table(view, schema="bed3", index_col=False)
        # Convert view_df to viewframe:
        try:
            view_df = bioframe.make_viewframe(view_df,
                                              check_bounds=clr.chromsizes)
        except ValueError as e:
            raise ValueError(
                "View table is incorrect, please, comply with the format. "
            ) from e

    if not bioframe.is_contained(features_df, view_df):
        raise ValueError("Features are not contained in view bounds")

    ##### Read expected, should be cis-expected:
    if expected is not None:
        expected_path, expected_value_col = expected
        expected_summary_cols = [
            expected_value_col,
        ]
        expected = read_expected(
            expected_path,
            contact_type="cis",
            expected_value_cols=expected_summary_cols,
            verify_view=view_df,
            verify_cooler=clr,
        )

    ##### Create the pileup:
    stack = snipping.pileup(
        clr,
        features_df,
        view_df=view_df,
        expected_df=expected,
        flank=flank,
        min_diag=ignore_diags,  # TODO: implement in pileup API
        clr_weight_name=weight_name,  # TODO: implement in pileup API
        force=force,  # TODO: implement in pileup API
        nproc=nproc,
    )

    ##### Aggregate the signal:
    aggregate = aggregate.lower() if aggregate is not None else None
    if aggregate is None or aggregate == "mean" or aggregate == "none":
        agg_func = np.nanmean
    elif aggregate == "median":
        agg_func = np.nanmedian
    elif aggregate == "min":
        agg_func = np.nanmin
    elif aggregate == "max":
        agg_func = np.nanmax
    elif aggregate == "std":
        agg_func = np.nanstd
    else:
        raise ValueError(
            f"Aggregation mode {aggregate} not supported. Please use mean/median/min/max/std."
        )

    pileup = agg_func(stack, axis=2)

    ##### Store the data as NPZ file:
    if out_format.lower() == "npz":
        if store_snips:
            np.savez(out, pileup=pileup, stack=stack)
        else:
            np.savez(out, pileup=pileup)
    elif out_format.lower() == "hdf5":
        h5 = h5py.File(out, "w")
        h5.create_dataset("pileup", data=pileup)
        if store_snips:
            h5.create_dataset("stack", data=stack)
Example #9
def compute_saddle(
    cool_path,
    track_path,
    expected_path,
    contact_type,
    min_dist,
    max_dist,
    n_bins,
    vrange,
    qrange,
    clr_weight_name,
    strength,
    view,
    out_prefix,
    fig,
    scale,
    cmap,
    vmin,
    vmax,
    hist_color,
    verbose,
):
    """
    Calculate saddle statistics and generate saddle plots for an arbitrary
    signal track on the genomic bins of a contact matrix.

    COOL_PATH : The paths to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    TRACK_PATH : The path to bedGraph-like file with a binned compartment track
    (eigenvector), including a header. Use the '::' syntax to specify a column
    name.

    EXPECTED_PATH : The paths to a tsv-like file with expected signal,
    including a header. Use the '::' syntax to specify a column name.

    Analysis will be performed for chromosomes referred to in TRACK_PATH, and
    therefore these chromosomes must be a subset of chromosomes referred to in
    COOL_PATH and EXPECTED_PATH.

    COOL_PATH, TRACK_PATH and EXPECTED_PATH must be binned at the same
    resolution (except for EXPECTED_PATH in the case of trans contact type).

    EXPECTED_PATH must contain at least the following columns for cis contacts:
    'chrom', 'diag', 'n_valid', value_name, and the following columns for trans
    contacts: 'chrom1', 'chrom2', 'n_valid', value_name. value_name is controlled
    using options. A header must be present in the file.

    """
    #### Read inputs: ####
    clr = cooler.Cooler(cool_path)
    expected_path, expected_value_col = expected_path
    track_path, track_name = track_path

    #### Read track: ####
    # read bedGraph-file :
    track_columns = ["chrom", "start", "end", track_name]
    # specify dtype as a rudimentary form of validation:
    track_dtype = {
        "chrom": np.str,
        "start": np.int64,
        "end": np.int64,
        track_name: np.float64,
    }
    track = pd.read_table(
        track_path,
        usecols=track_columns,
        dtype=track_dtype,
        comment=None,
        verbose=verbose,
    )

    #### Generate viewframes ####
    # 1:cooler_view_df. Generate viewframe from clr.chromsizes:
    cooler_view_df = bioframe.make_viewframe(clr.chromsizes)

    # 2:view_df. Define the global view for the saddle calculation
    # use input "view" BED file or all chromosomes :
    view_df = cooler_view_df if (view is None) else read_viewframe(
        view, cooler_view_df)

    # 3:track_view_df. Generate viewframe from track table:
    track_view_df = bioframe.make_viewframe([
        (group.chrom.iloc[0], np.nanmin(group.start), np.nanmax(group.end))
        for i, group in track.reset_index().groupby("chrom")
    ])

    #### Read expected: ####
    expected_summary_cols = [
        expected_value_col,
    ]
    expected = read_expected(
        expected_path,
        contact_type=contact_type,
        expected_value_cols=expected_summary_cols,
        verify_view=view_df,
        verify_cooler=clr,
    )
    # add checks to make sure cis-expected is symmetric

    #############################################
    # CROSS-VALIDATE viewframes of COOLER, TRACK and EXPECTED:
    # Scheme: view <= {track_view, expected} <= cooler_view
    #############################################

    # Track is contained in cooler bounds, but not vice versa (because cooler may have more regions):
    if not bioframe.is_contained(track_view_df, cooler_view_df):
        raise ValueError(
            "Track regions are not contained in cooler chromsizes bounds")

    # View is contained in track bounds, but not vice versa (because track may have more regions):
    if not bioframe.is_contained(view_df, track_view_df):
        raise ValueError(
            "View table does not have some regions annotated in the track")

    #############################################
    # CROSS-VALIDATION IS COMPLETE.
    #############################################

    if min_dist < 0:
        min_diag = 3
    else:
        min_diag = int(np.ceil(min_dist / clr.binsize))

    if max_dist >= 0:
        max_diag = int(np.floor(max_dist / clr.binsize))
    else:
        max_diag = -1

    track = saddle.mask_bad_bins((track, track_name),
                                 (clr.bins()[:], clr_weight_name))
    if vrange[0] is None:
        vrange = None
    if qrange[0] is None:
        qrange = None

    digitized, binedges = saddle.get_digitized(
        track[["chrom", "start", "end", track_name]],
        n_bins,
        vrange=vrange,
        qrange=qrange,
        digitized_suffix=".d",
    )
    S, C = saddle.get_saddle(
        clr,
        expected,
        digitized[["chrom", "start", "end", track_name + ".d"]],
        contact_type,
        view_df=view_df,
        clr_weight_name=clr_weight_name,
        expected_value_col=expected_value_col,
        view_name_col="name",
        min_diag=min_diag,
        max_diag=max_diag,
        verbose=verbose,
    )
    saddledata = S / C

    to_save = dict(saddledata=saddledata,
                   binedges=binedges,
                   digitized=digitized,
                   saddlecounts=C)

    if strength:
        ratios = saddle.saddle_strength(S, C)
        ratios = ratios[1:-1]  # drop outlier bins
        to_save["saddle_strength"] = ratios

    # Save data
    np.savez(out_prefix + ".saddledump", **to_save)  # .npz auto-added
    digitized.to_csv(out_prefix + ".digitized.tsv", sep="\t", index=False)

    # Generate figure
    if len(fig):
        try:
            import matplotlib as mpl

            mpl.use("Agg")  # savefig only for now:
            import matplotlib.pyplot as plt
        except ImportError:
            print("Install matplotlib to use ", file=sys.stderr)
            sys.exit(1)

        if hist_color is None:
            color = (
                0.41568627450980394,
                0.8,
                0.39215686274509803,
            )  # sns.color_palette('muted')[2]
        else:
            color = mpl.colors.colorConverter.to_rgb(hist_color)
        title = op.basename(cool_path) + " ({})".format(contact_type)

        if qrange is not None:
            track_label = track_name + " quantiles"
        else:
            track_label = track_name

        clabel = "(contact frequency / expected)"

        saddle.saddleplot(
            track,
            saddledata,
            n_bins,
            vrange=vrange,
            qrange=qrange,
            scale=scale,
            vmin=vmin,
            vmax=vmax,
            color=color,
            title=title,
            xlabel=track_label,
            ylabel=track_label,
            clabel=clabel,
            cmap=cmap,
        )

        for ext in fig:
            plt.savefig(out_prefix + "." + ext, bbox_inches="tight")
Example #10
def pileup(
    cool_path,
    features,
    view,
    expected,
    flank,
    features_format,
    clr_weight_name,
    out,
    out_format,
    store_snips,
    nproc,
    ignore_diags,
    aggregate,
    verbose,
):
    """
    Perform retrieval of the snippets from .cool file.

    COOL_PATH : The paths to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    FEATURES_PATH : the path to a BED or BEDPE-like file that contains features for snipping windows.
    If BED, then the features are on-diagonal. If BEDPE, then the features
    can be off-diagonal (but not in trans or between different regions in the view).

    """

    clr = cooler.Cooler(cool_path)
    cooler_view_df = make_cooler_view(clr)

    #### Read the features:
    buf, names = sniff_for_header(features)
    if features_format.lower() == "bedpe":
        default_cols = [0, 1, 2, 3, 4, 5]
        bedpe_cols = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]
        dtypes = {
            "chrom1": str,
            "start1": np.int64,
            "end1": np.int64,
            "chrom2": str,
            "start2": np.int64,
            "end2": np.int64,
        }
        if names is None:
            kwargs = dict(
                header=None,
                usecols=default_cols,
                dtype=dtypes,
                names=bedpe_cols,
            )
        else:
            kwargs = dict(header="infer", usecols=bedpe_cols)
    elif features_format.lower() == "bed":
        default_cols = [0, 1, 2]
        bed_cols = ["chrom", "start", "end"]
        dtypes = {"chrom": str, "start": np.int64, "end": np.int64}
        if names is None:
            kwargs = dict(
                header=None,
                usecols=default_cols,
                dtype=dtypes,
                names=bed_cols,
            )
        else:
            kwargs = dict(header="infer", usecols=bed_cols, dtype=dtypes)
    else:
        raise NotImplementedError(
            "Automatic detection of features format is not implemented yet. "
            "Please provide BED or BEDPE as --features-format")

    # column selection and dtypes are carried in kwargs, set per features_format above:
    features_df = pd.read_table(buf,
                                comment="#",
                                verbose=verbose,
                                **kwargs)

    ###### Define view
    if view is None:
        # full chromosome case
        view_df = cooler_view_df
    else:
        # Read view_df dataframe, and verify against cooler
        view_df = read_viewframe_from_file(view, clr, check_sorting=True)

    # make sure feature are compatible with the view_df
    if not bioframe.is_contained(features_df, view_df):
        raise ValueError("Features are not contained in view bounds")

    ##### Read expected, should be cis-expected:
    if expected is None:
        expected_value_col = None
    else:
        expected_path, expected_value_col = expected
        expected_value_cols = [
            expected_value_col,
        ]
        expected = read_expected_from_file(
            expected_path,
            contact_type="cis",
            expected_value_cols=expected_value_cols,
            verify_view=view_df,
            verify_cooler=clr,
        )

    ##### Create the pileup:
    stack = api.snipping.pileup(
        clr,
        features_df,
        view_df=view_df,
        expected_df=expected,
        expected_value_col=expected_value_col,
        flank=flank,
        min_diag=ignore_diags,
        clr_weight_name=clr_weight_name,
        nproc=nproc,
    )

    ##### Aggregate the signal:
    aggregate = aggregate.lower() if aggregate is not None else None
    if aggregate is None or aggregate == "mean" or aggregate == "none":
        agg_func = np.nanmean
    elif aggregate == "median":
        agg_func = np.nanmedian
    elif aggregate == "min":
        agg_func = np.nanmin
    elif aggregate == "max":
        agg_func = np.nanmax
    elif aggregate == "std":
        agg_func = np.nanstd
    else:
        raise ValueError(
            f"Aggregation mode {aggregate} not supported. Please use mean/median/min/max/std."
        )

    pileup = agg_func(stack, axis=2)

    ##### Store the data as NPZ file:
    if out_format.lower() == "npz":
        if store_snips:
            np.savez(out, pileup=pileup, stack=stack)
        else:
            np.savez(out, pileup=pileup)
    elif out_format.lower() == "hdf5":
        h5 = h5py.File(out, "w")
        h5.create_dataset("pileup", data=pileup)
        if store_snips:
            h5.create_dataset("stack", data=stack)