Example #1
def do_pileup_iccf(
    clr: cooler.Cooler,
    snipping_windows: pd.DataFrame,
    proc: int = 5,
    collapse: bool = True,
) -> np.ndarray:
    """Takes a cooler file handle and snipping windows constructed
    by assignRegions and performs a pileup on all these regions
    based on the corrected HiC counts. Returns a numpy array
    that contains averages of all selected regions. The collapse
    parameter specifies whether to return
    the average window over all piles (collapse=True), or the individual
    windows (collapse=False)."""
    # get regions from snipping windows
    region_frame = get_regions_from_snipping_windows(snipping_windows)
    iccf_snipper = cooltools.snipping.CoolerSnipper(
        clr, regions=bioframe.parse_regions(region_frame))
    with multiprocess.Pool(proc) as pool:
        iccf_pile = cooltools.snipping.pileup(snipping_windows,
                                              iccf_snipper.select,
                                              iccf_snipper.snip,
                                              map=pool.map)
    if collapse:
        # calculate the average of all windows
        collapsed_pile_plus = np.nanmean(iccf_pile, axis=2)
        return collapsed_pile_plus
    return iccf_pile
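A minimal usage sketch (not part of the original source): it assumes a cooler file at a hypothetical path "hic.10kb.cool" and an assumed `peaks` DataFrame with "chrom" and "pos" columns, and reuses assign_regions from Example #4 to build the snipping windows.

clr = cooler.Cooler("hic.10kb.cool")  # hypothetical path
# whole chromosomes as "arms" for simplicity
arms = pd.DataFrame({"chrom": clr.chromnames, "start": 0,
                     "end": clr.chromsizes.values})
snipping_windows = assign_regions(
    window=100_000,          # +/- 100 kb around each peak
    binsize=clr.binsize,
    chroms=peaks["chrom"],   # `peaks` is an assumed input DataFrame
    positions=peaks["pos"],
    arms=arms,
)
avg_pile = do_pileup_iccf(clr, snipping_windows, proc=4, collapse=True)
print(avg_pile.shape)        # a single 2D averaged window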
Example #2
def do_pileup_obs_exp(
    clr: cooler.Cooler,
    expected_df: pd.DataFrame,
    snipping_windows: pd.DataFrame,
    proc: int = 5,
    collapse: bool = True,
) -> np.ndarray:
    """Takes a cooler file handle, an expected dataframe
    constructed by getExpected, snipping windows constructed
    by assignRegions and performs a pileup on all these regions
    based on the obs/exp value. Returns a numpy array
    that contains averages of all selected regions.
    The collapse parameter specifies whether to return
    the average window over all piles (collapse=True), or the individual
    windows (collapse=False)."""
    region_frame = get_regions_from_snipping_windows(expected_df)
    oe_snipper = cooltools.snipping.ObsExpSnipper(
        clr, expected_df, regions=bioframe.parse_regions(region_frame))
    # set warnings filter to ignore RuntimeWarnings since cooltools
    # does not check whether there are inf or 0 values in
    # the expected dataframe
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RuntimeWarning)
        with multiprocess.Pool(proc) as pool:
            # extract a matrix of obs/exp average values for each snipping_window
            oe_pile = cooltools.snipping.pileup(snipping_windows,
                                                oe_snipper.select,
                                                oe_snipper.snip,
                                                map=pool.map)
    if collapse:
        # calculate the average of all windows
        collapsed_pile = np.nanmean(oe_pile, axis=2)
        return collapsed_pile
    return oe_pile
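Since the snipper returns obs/exp ratios, the collapsed pile is usually inspected on a log2 scale so that enrichment and depletion are symmetric around zero. A hedged plotting sketch, assuming matplotlib is installed and `clr`, `expected_df` and `snipping_windows` are already in scope:

import matplotlib.pyplot as plt

pile = do_pileup_obs_exp(clr, expected_df, snipping_windows, collapse=True)
plt.imshow(np.log2(pile), cmap="RdBu_r", vmin=-1, vmax=1)
plt.colorbar(label="log2(obs/exp)")
plt.show()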
Example #3
def extract_windows_different_sizes_iccf(regions,
                                         arms,
                                         cooler_file,
                                         processes=2):
    """For extraction of a collection of regions that span genomic regions .
    regions -> data_frame with chrom, start, end (start, end in genomic coordinates)
    cooler -> opened cooler file
    arms  -> chromosomal arms
    """
    # assign arms to regions
    snipping_windows = _assign_supports(regions,
                                        bioframe.parse_regions(arms)).dropna()
    iccf_snipper = cooltools.snipping.CoolerSnipper(
        cooler_file, regions=bioframe.parse_regions(arms))
    with multiprocess.Pool(processes) as pool:
        result = flexible_pileup(snipping_windows,
                                 iccf_snipper.select,
                                 iccf_snipper.snip,
                                 mapper=pool.map)
    return result
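A hypothetical usage sketch: unlike the fixed-size pileups above, this helper snips regions of heterogeneous sizes (e.g. TADs) in full, so the result of flexible_pileup is presumably one matrix per region rather than a single stacked array. The `tads` input and `clr` cooler are assumptions.

tads = pd.DataFrame({"chrom": ["chr1", "chr1"],
                     "start": [1_000_000, 2_500_000],
                     "end": [1_400_000, 3_100_000]})  # assumed input
arms = pd.DataFrame({"chrom": clr.chromnames, "start": 0,
                     "end": clr.chromsizes.values})
piles = extract_windows_different_sizes_iccf(tads, arms, clr, processes=2)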
Example #4
def assign_regions(
    window: int,
    binsize: int,
    chroms: pd.Series,
    positions: pd.Series,
    arms: pd.DataFrame,
) -> pd.DataFrame:
    """Constructs a 2d region around a series of chromosomal location.
    Window specifies the windowsize for the constructed regions. The total region
    assigned will be pos-window until pos+window. The binsize specifies the size
    of the HiC bins. The positions which represent the center of the regions
    is givin the the chroms series and the positions series."""
    # construct windows from the passed chromosomes and positions
    snipping_windows = cooltools.snipping.make_bin_aligned_windows(
        binsize, chroms.values, positions.values, window)
    # assign chromosomal arm to each position
    snipping_windows = _assign_supports(snipping_windows,
                                        bioframe.parse_regions(arms))
    return snipping_windows
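A small sketch of what the function consumes and produces, assuming 10 kb bins and a 50 kb window: each (chrom, pos) pair becomes a bin-aligned interval of roughly [pos - 50 kb, pos + 50 kb), annotated with the chromosomal arm it falls into. The arm coordinates below are illustrative only.

chroms = pd.Series(["chr1", "chr2"])
positions = pd.Series([1_234_567, 7_654_321])
arms = pd.DataFrame({"chrom": ["chr1", "chr2"], "start": 0,
                     "end": [249_250_621, 243_199_373]})
windows = assign_regions(window=50_000, binsize=10_000,
                         chroms=chroms, positions=positions, arms=arms)
print(windows.head())  # chrom/start/end columns plus the assigned arm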
Example #5
def compute_expected(
    cool_path,
    nproc,
    chunksize,
    output,
    hdf,
    contact_type,
    regions,
    balance,
    weight_name,
    blacklist,
    ignore_diags,
):
    """
    Calculate expected Hi-C signal either for cis or for trans regions
    of chromosomal interaction map.

    When balancing weights are not applied to the data, there is no
    masking of bad bins performed.

    COOL_PATH : The path to a .cool file with a balanced Hi-C map.

    """

    if blacklist is not None:
        raise NotImplementedError(
            "Custom genomic regions for masking from calculation of expected"
            "are not implemented."
        )
        # use blacklist-ing from cooler balance module
        # https://github.com/mirnylab/cooler/blob/843dadca5ef58e3b794dbaf23430082c9a634532/cooler/cli/balance.py#L175

    clr = cooler.Cooler(cool_path)
    if regions is None:
        regions = [(chrom, 0, clr.chromsizes[chrom]) for chrom in clr.chromnames]
        regions = parse_regions(regions)
        regions["name"] = clr.chromnames
    else:
        regions_buf, names = util.sniff_for_header(regions)
        regions = pd.read_csv(regions_buf, sep="\t", header=None)
        if regions.shape[1] not in (3, 4):
            raise ValueError(
                "The region file does not have three or four tab-delimited columns."
                "We expect a bed file with columns chrom, start, end, and optional name"
            )
        if regions.shape[1] == 4:
            regions = regions.rename(columns={0:"chrom",1:"start",2:"end",3:"name"})
            regions = parse_regions(regions)
        else:
            regions = regions.rename(columns={0:"chrom",1:"start",2:"end"})
            regions["name"] = list(regions.apply(lambda x: "{}:{}-{}".format(*x), axis=1))
            regions = parse_regions(regions)

    # define transforms - balanced and raw ('count') for now
    if balance:
        weight1 = weight_name + "1"
        weight2 = weight_name + "2"
        transforms = {"balanced": lambda p: p["count"] * p[weight1] * p[weight2]}
    else:
        # no masking bad bins of any kind, when balancing is not applied
        weight_name = None
        transforms = {}

    # execution details
    if nproc > 1:
        pool = mp.Pool(nproc)
        map_ = pool.map
    else:
        map_ = map

    # using try-clause to close mp.Pool properly
    try:
        if contact_type == "cis":
            result = expected.diagsum(
                clr,
                regions,
                transforms=transforms,
                weight_name=weight_name,
                bad_bins=None,
                chunksize=chunksize,
                ignore_diags=ignore_diags,
                map=map_,
            )
        elif contact_type == "trans":
            # prepare pairwise combinations of regions for trans-expected (blocksum):
            regions_pairwise = combinations(regions.itertuples(index=False), 2)
            regions1, regions2 = zip(*regions_pairwise)
            result = expected.blocksum_asymm(
                clr,
                regions1 = pd.DataFrame(regions1),
                regions2 = pd.DataFrame(regions2),
                transforms=transforms,
                weight_name=weight_name,
                bad_bins=None,
                chunksize=chunksize,
                map=map_,
            )
    finally:
        if nproc > 1:
            pool.close()

    # calculate actual averages by dividing sum by n_valid:
    result["count.avg"] = result["count.sum"] / result["n_valid"]
    for key in transforms.keys():
        result[key + ".avg"] = result[key + ".sum"] / result["n_valid"]

    # output to file if specified:
    if output:
        result.to_csv(output, sep="\t", index=False, na_rep="nan")
    # or print into stdout otherwise:
    else:
        print(result.to_csv(sep="\t", index=False, na_rep="nan"))

    # would be nice to have some binary output to preserve precision.
    # to_hdf/read_hdf should work in this case, as the file is small.
    # it is still debated how we should store it - storing in the cooler
    # seems to be the consensus:
    if hdf:
        raise NotImplementedError("hdf output is to be implemented")
Example #6
def make_saddle(
    getmatrix,
    binedges,
    digitized,
    contact_type,
    regions=None,
    min_diag=3,
    max_diag=-1,
    trim_outliers=False,
    verbose=False,
):
    """
    Make a matrix of average interaction probabilities between genomic bin
    pairs as a function of a specified genomic track. The provided genomic
    track must be pre-quantized as integers (i.e. digitized).

    Parameters
    ----------
    getmatrix : function
        A function returning a matrix of interaction between two chromosomes
        given their names/indices.
    binedges : 1D array (length n + 1)
        Bin edges of the digitized signal. For `n` bins, there are `n + 1`
        edges. See :func:`digitize_track`.
    digitized : tuple of (DataFrame, str)
        BedGraph-like dataframe of digitized signal along with the name of
        the digitized value column.
    contact_type : str
        If 'cis' then only cis interactions are used to build the matrix.
        If 'trans', only trans interactions are used.
    regions : sequence of str or tuple, optional
        A list of genomic regions to use. Each can be a chromosome, a
        UCSC-style genomic region string or a tuple.
    min_diag : int
        Smallest diagonal to include in computation. Ignored with
        contact_type=trans.
    max_diag : int
        Biggest diagonal to include in computation. Ignored with
        contact_type=trans.
    trim_outliers : bool, optional
        Remove first and last row and column from the output matrix.
    verbose : bool, optional
        If True then reports progress.

    Returns
    -------
    interaction_sum : 2D array
        The matrix of summed interaction probability between two genomic bins
        given their values of the provided genomic track.
    interaction_count : 2D array
        The matrix of the number of genomic bin pairs that contributed to the
        corresponding pixel of ``interaction_sum``.

    """
    digitized_df, name = digitized
    digitized_df = digitized_df[["chrom", "start", "end", name]]

    if regions is None:
        regions = [(chrom, df.start.min(), df.end.max())
                   for chrom, df in digitized_df.groupby("chrom")]

    regions = bioframe.parse_regions(regions)

    digitized_tracks = {}
    for reg in regions.values:
        track = bioframe.select(digitized_df, reg)
        digitized_tracks[reg[3]] = track[name]  # 3 = name

    if contact_type == "cis":
        supports = list(zip(regions["name"], regions["name"]))
    elif contact_type == "trans":
        supports = list(combinations(regions["name"], 2))
    else:
        raise ValueError("The allowed values for the contact_type "
                         "argument are 'cis' or 'trans'.")

    # n_bins here includes 2 open bins
    # for values <lo and >hi.
    n_bins = len(binedges) + 1
    interaction_sum = np.zeros((n_bins, n_bins))
    interaction_count = np.zeros((n_bins, n_bins))

    for reg1, reg2 in supports:
        _accumulate(
            interaction_sum,
            interaction_count,
            getmatrix,
            digitized_tracks,
            reg1,
            reg2,
            min_diag,
            max_diag,
            verbose,
        )

    interaction_sum += interaction_sum.T
    interaction_count += interaction_count.T

    if trim_outliers:
        interaction_sum = interaction_sum[1:-1, 1:-1]
        interaction_count = interaction_count[1:-1, 1:-1]

    return interaction_sum, interaction_count
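The getmatrix argument is just a callable mapping two region names to an obs/exp matrix. The helpers make_cis_obsexp_fetcher / make_trans_obsexp_fetcher (used in Example #12) build it for you; the following hand-rolled cis version is an illustrative sketch only, assuming `clr` is a balanced cooler and `expected` is indexed by (region, diag) with a "balanced.avg" column.

from scipy.linalg import toeplitz

def getmatrix(reg1, reg2):
    # cis case: reg1 == reg2
    obs = clr.matrix(balance=True).fetch(reg1)
    diag_avg = expected.loc[reg1, "balanced.avg"].values
    exp = toeplitz(diag_avg[:obs.shape[0]])  # exp[i, j] = diag_avg[|i - j|]
    return obs / exp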
Example #7
def blocksum_asymm(
    clr,
    regions1,
    regions2,
    transforms={},
    weight_name="weight",
    bad_bins=None,
    chunksize=1000000,
    map=map,
):
    """
    Summary statistics on rectangular blocks of genomic regions.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    regions1 : sequence of genomic range tuples
        "left"-side support regions for block summation
    regions2 : sequence of genomic range tuples
        "right"-side support regions for block summation
    transforms : dict of str -> callable, optional
        Transformations to apply to pixels. The result will be assigned to
        a temporary column with the name given by the key. Callables take
        one argument: the current chunk of the (annotated) pixel dataframe.
    weight_name : str
        name of the balancing weight vector used to count
        "bad" (masked) pixels per block.
        Use `None` to avoid masking "bad" pixels.
    bad_bins : array-like
        a list of bins to ignore per support region.
        Combined with the list of bad bins derived from the
        balancing weight.
    chunksize : int, optional
        Size of pixel table chunks to process
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    DataFrame with entries for each block: region1, region2, n_valid, count.sum

    """

    regions1 = bioframe.parse_regions(regions1, clr.chromsizes)
    regions2 = bioframe.parse_regions(regions2, clr.chromsizes)

    spans = partition(0, len(clr.pixels()), chunksize)
    fields = ["count"] + list(transforms.keys())

    # similar to the diagonal summations, pre-generate a block_table listing
    # all of the rectangular blocks and the "n_valid" number of pixels per block:
    records = make_block_table(
        clr, regions1, regions2, weight_name=weight_name, bad_bins=bad_bins
    )

    # combine masking with existing transforms and add a "count" transform:
    if bad_bins is not None:
        # turn bad_bins into a mask of size clr.bins:
        mask_size = len(clr.bins())
        bad_bins_mask = np.ones(mask_size, dtype=int)
        bad_bins_mask[bad_bins] = 0
        #
        masked_transforms = {}
        bin1 = "bin1_id"
        bin2 = "bin2_id"
        for field in fields:
            if field in transforms:
                # combine masking and transform, minding the scope:
                t = transforms[field]
                masked_transforms[field] = (
                    lambda p, t=t, m=bad_bins_mask: t(p) * m[p[bin1]] * m[p[bin2]]
                )
            else:
                # presumably field == "count", mind the scope as well:
                masked_transforms[field] = (
                    lambda p, f=field, m=bad_bins_mask: p[f] * m[p[bin1]] * m[p[bin2]]
                )
        # replace transforms with the masked versions:
        transforms = masked_transforms

    job = partial(
        _blocksum_asymm, clr, fields, transforms, regions1.values, regions2.values
    )
    results = map(job, spans)
    for result in results:
        for i, agg in result.items():
            for field in fields:
                agg_name = "{}.sum".format(field)
                s = agg[field].item()
                if not np.isnan(s):
                    n1 = regions1.loc[i, "name"]
                    n2 = regions2.loc[i, "name"]
                    records[n1, n2][agg_name] += s

    # returning a dataframe for API consistency:
    return pd.DataFrame(
        [{"region1": n1, "region2": n2, **rec} for (n1, n2), rec in records.items()],
        columns=["region1", "region2", "n_valid", "count.sum"]
        + [k + ".sum" for k in transforms.keys()],
    )
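The `t=t, f=field, m=bad_bins_mask` default arguments in the masking lambdas above are deliberate: Python closures capture loop variables by reference, so without the defaults every lambda created in the loop would see the final value of `field`. A minimal demonstration of the pitfall:

funcs_late = [lambda x: x * k for k in (1, 2, 3)]
funcs_bound = [lambda x, k=k: x * k for k in (1, 2, 3)]
print([f(10) for f in funcs_late])   # [30, 30, 30] -- all share the last k
print([f(10) for f in funcs_bound])  # [10, 20, 30] -- k bound at definition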
Example #8
def diagsum_asymm(
    clr,
    regions1,
    regions2,
    transforms={},
    weight_name="weight",
    bad_bins=None,
    chunksize=10000000,
    map=map,
):
    """

    Diagonal summary statistics.

    Matching elements of `regions1` and `regions2` define
    asymmetric rectangular blocks for calculating diagonal
    summary statistics.
    Only intra-chromosomal blocks are supported.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    regions1 : sequence of genomic range tuples
        "left"-side support regions for diagonal summation
    regions2 : sequence of genomic range tuples
        "right"-side support regions for diagonal summation
    transforms : dict of str -> callable, optional
        Transformations to apply to pixels. The result will be assigned to
        a temporary column with the name given by the key. Callables take
        one argument: the current chunk of the (annotated) pixel dataframe.
    weight_name : str
        name of the balancing weight vector used to count
        "bad" (masked) pixels per diagonal.
        Use `None` to avoid masking "bad" pixels.
    bad_bins : array-like
        a list of bins to ignore per support region.
        Combined with the list of bad bins derived from the
        balancing weight.
    chunksize : int, optional
        Size of pixel table chunks to process
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    DataFrame with summary statistic of every diagonal of every block:
    region1, region2, diag, n_valid, count.sum

    """
    spans = partition(0, len(clr.pixels()), chunksize)
    fields = ["count"] + list(transforms.keys())
    regions1 = bioframe.parse_regions(regions1, clr.chromsizes)
    regions2 = bioframe.parse_regions(regions2, clr.chromsizes)

    dtables = make_diag_tables(
        clr, regions1, regions2, weight_name=weight_name, bad_bins=bad_bins
    )

    # combine masking with existing transforms and add a "count" transform:
    if bad_bins is not None:
        # turn bad_bins into a mask of size clr.bins:
        mask_size = len(clr.bins())
        bad_bins_mask = np.ones(mask_size, dtype=int)
        bad_bins_mask[bad_bins] = 0
        #
        masked_transforms = {}
        bin1 = "bin1_id"
        bin2 = "bin2_id"
        for field in fields:
            if field in transforms:
                # combine masking and transform, minding the scope:
                t = transforms[field]
                masked_transforms[field] = (
                    lambda p, t=t, m=bad_bins_mask: t(p) * m[p[bin1]] * m[p[bin2]]
                )
            else:
                # presumably field == "count", mind the scope as well:
                masked_transforms[field] = (
                    lambda p, f=field, m=bad_bins_mask: p[f] * m[p[bin1]] * m[p[bin2]]
                )
        # replace transforms with the masked versions:
        transforms = masked_transforms

    for dt in dtables.values():
        for field in fields:
            agg_name = "{}.sum".format(field)
            dt[agg_name] = 0

    job = partial(
        _diagsum_asymm, clr, fields, transforms, regions1.values, regions2.values
    )
    results = map(job, spans)
    for result in results:
        for i, agg in result.items():
            region1 = regions1.loc[i, "name"]
            region2 = regions2.loc[i, "name"]
            for field in fields:
                agg_name = "{}.sum".format(field)
                dtables[region1, region2][agg_name] = dtables[region1, region2][
                    agg_name
                ].add(agg[field], fill_value=0)

    # returning a dataframe for API consistency:
    result = []
    for (i, j), dtable in dtables.items():
        dtable = dtable.reset_index()
        dtable.insert(0, "region1", i)
        dtable.insert(1, "region2", j)
        result.append(dtable)
    return pd.concat(result).reset_index(drop=True)
Example #9
def diagsum(
    clr,
    regions,
    transforms={},
    weight_name="weight",
    bad_bins=None,
    chunksize=10000000,
    ignore_diags=2,
    map=map,
):
    """

    Intra-chromosomal diagonal summary statistics.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    regions : sequence of genomic range tuples
        Support regions for intra-chromosomal diagonal summation
    transforms : dict of str -> callable, optional
        Transformations to apply to pixels. The result will be assigned to
        a temporary column with the name given by the key. Callables take
        one argument: the current chunk of the (annotated) pixel dataframe.
    weight_name : str
        name of the balancing weight vector used to count
        "bad" (masked) pixels per diagonal.
        Use `None` to avoid masking "bad" pixels.
    bad_bins : array-like
        a list of bins to ignore per support region.
        Combined with the list of bad bins derived from the
        balancing weight.
    chunksize : int, optional
        Size of pixel table chunks to process
    ignore_diags : int, optional
        Number of initial diagonals to exclude from statistics
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    Dataframe of diagonal statistics for all regions

    """
    spans = partition(0, len(clr.pixels()), chunksize)
    fields = ["count"] + list(transforms.keys())

    regions = bioframe.parse_regions(regions, clr.chromsizes)

    dtables = make_diag_tables(clr, regions, weight_name=weight_name, bad_bins=bad_bins)

    # combine masking with existing transforms and add a "count" transform:
    if bad_bins is not None:
        # turn bad_bins into a mask of size clr.bins:
        mask_size = len(clr.bins())
        bad_bins_mask = np.ones(mask_size, dtype=int)
        bad_bins_mask[bad_bins] = 0
        #
        masked_transforms = {}
        bin1 = "bin1_id"
        bin2 = "bin2_id"
        for field in fields:
            if field in transforms:
                # combine masking and transform, minding the scope:
                t = transforms[field]
                masked_transforms[field] = (
                    lambda p, t=t, m=bad_bins_mask: t(p) * m[p[bin1]] * m[p[bin2]]
                )
            else:
                # presumably field == "count", mind the scope as well:
                masked_transforms[field] = (
                    lambda p, f=field, m=bad_bins_mask: p[f] * m[p[bin1]] * m[p[bin2]]
                )
        # replace transforms with the masked versions:
        transforms = masked_transforms

    for dt in dtables.values():
        for field in fields:
            agg_name = "{}.sum".format(field)
            dt[agg_name] = 0

    job = partial(_diagsum_symm, clr, fields, transforms, regions.values)
    results = map(job, spans)
    for result in results:
        for i, agg in result.items():
            region = regions.loc[i, "name"]
            for field in fields:
                agg_name = "{}.sum".format(field)
                dtables[region][agg_name] = dtables[region][agg_name].add(
                    agg[field], fill_value=0
                )

    if ignore_diags:
        for dt in dtables.values():
            for field in fields:
                agg_name = "{}.sum".format(field)
                j = dt.columns.get_loc(agg_name)
                dt.iloc[:ignore_diags, j] = np.nan

    # returning dataframe for API consistency
    result = []
    for i, dtable in dtables.items():
        dtable = dtable.reset_index()
        dtable.insert(0, "region", i)
        result.append(dtable)
    return pd.concat(result).reset_index(drop=True)
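A hedged usage sketch mirroring what compute_expected in Example #5 does with this function: sum balanced pixel values per diagonal over whole chromosomes, then turn the sums into averages. `clr` is an assumed balanced cooler.

transforms = {"balanced": lambda p: p["count"] * p["weight1"] * p["weight2"]}
regions = pd.DataFrame({"chrom": clr.chromnames, "start": 0,
                        "end": clr.chromsizes.values})
tbl = diagsum(clr, regions, transforms=transforms,
              weight_name="weight", chunksize=1_000_000, ignore_diags=2)
tbl["balanced.avg"] = tbl["balanced.sum"] / tbl["n_valid"]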
Example #10
def make_block_table(clr, regions1, regions2, weight_name="weight", bad_bins=None):
    """
    Creates a table that characterizes a set of rectangular genomic blocks
    formed by combining regions from regions1 and regions2.
    For every block, calculate its "area" in pixels ("n_total") and the
    number of "valid" pixels it contains ("n_valid").
    "Valid" pixels exclude "bad" pixels, which in turn are inferred from the
    balancing weight column `weight_name` or provided directly in the form of
    an array of `bad_bins`.

    Setting `weight_name` and `bad_bins` to `None` yields 0 "bad" pixels per
    block.

    Parameters
    ----------
    clr : cooler.Cooler
        Input cooler
    regions1 : iterable
        a collection of genomic regions
    regions2 : iterable
        a collection of genomic regions
    weight_name : str
        name of the weight vector in the "bins" table,
        if weight_name is None, 0 "bad" pixels are counted per block.
        Balancing weights are used to infer bad bins.
    bad_bins : array-like
        a list of bins to ignore. Indexes of bins must
        be absolute, as in clr.bins()[:], as opposed to
        being offset by chromosome start.
        "bad_bins" will be combined with the bad bins
        masked by balancing if there are any.

    Returns
    -------
    block_table : dict
        dictionary for blocks that are 0-indexed
    """
    if bad_bins is None:
        bad_bins = np.asarray([]).astype(int)
    else:
        bad_bins = np.asarray(bad_bins).astype(int)

    regions1 = bioframe.parse_regions(regions1, clr.chromsizes).values
    regions2 = bioframe.parse_regions(regions2, clr.chromsizes).values

    # should we check for nestedness here, or that each region1 is < region2 ?

    block_table = {}
    for r1, r2 in zip(regions1, regions2):
        chrom1, start1, end1, name1 = r1
        chrom2, start2, end2, name2 = r2
        # translate regions into relative bin id-s:
        lo1, hi1 = clr.extent((chrom1, start1, end1))
        lo2, hi2 = clr.extent((chrom2, start2, end2))
        # width and height of a block:
        x = hi1 - lo1
        y = hi2 - lo2
        # get "regional" bad_bins for each of the regions
        bx = bad_bins[(bad_bins >= lo1) & (bad_bins < hi1)] - lo1
        by = bad_bins[(bad_bins >= lo2) & (bad_bins < hi2)] - lo2

        # now we need to combine it with the balancing weights
        if weight_name is None:
            bad_bins_x = len(bx)
            bad_bins_y = len(by)
        elif isinstance(weight_name, str):
            if weight_name not in clr.bins().columns:
                raise KeyError("Balancing weight {weight_name} not found!")
            else:
                # extract "bad" bins filtered by balancing:
                cb_bins_x = clr.bins()[weight_name][lo1:hi1].isnull().values
                cb_bins_y = clr.bins()[weight_name][lo2:hi2].isnull().values
                # combine with "bad_bins" using assignment:
                cb_bins_x[bx] = True
                cb_bins_y[by] = True
                # count and yield final list of bad bins:
                bad_bins_x = np.count_nonzero(cb_bins_x)
                bad_bins_y = np.count_nonzero(cb_bins_y)
        else:
            raise ValueError("`weight_name` can be `str` or `None`")

        # calculate total and bad pixels per block:
        n_tot = count_all_pixels_per_block(x, y)
        n_bad = count_bad_pixels_per_block(x, y, bad_bins_x, bad_bins_y)

        # fill in "block_table" with number of valid pixels:
        block_table[name1, name2] = defaultdict(int)
        block_table[name1, name2]["n_valid"] = n_tot - n_bad

    return block_table
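count_all_pixels_per_block and count_bad_pixels_per_block are not shown in this listing. Under the natural reading, their arithmetic is simple inclusion-exclusion over an x-by-y block; a hedged sketch, not necessarily the library implementation:

def count_all_pixels_per_block(x, y):
    # total block area in pixels
    return x * y

def count_bad_pixels_per_block(x, y, bad_bins_x, bad_bins_y):
    # pixels in bad rows + pixels in bad columns,
    # minus the doubly counted bad-row/bad-column intersections
    return x * bad_bins_y + y * bad_bins_x - bad_bins_x * bad_bins_y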
Example #11
def make_diag_tables(clr, regions, regions2=None, weight_name="weight", bad_bins=None):
    """
    For every support region infer diagonals that intersect this region
    and calculate the size of these intersections in pixels, both "total" and
    "n_valid", where "n_valid" does not include "bad" bins into counting.

    "Bad" pixels are inferred from the balancing weight column `weight_name` or
    provided directly in the form of an array `bad_bins`.

    Setting `weight_name` and `bad_bins` to `None` yields 0 "bad" pixels per
    diagonal per support region.

    When `regions2` are provided, all intersecting diagonals are reported for
    each rectangular and asymmetric block defined by combinations of matching
    elements of `regions` and `regions2`.
    Otherwise only `regions`-based symmetric square blocks are considered.
    Only intra-chromosomal regions are supported.

    Parameters
    ----------
    clr : cooler.Cooler
        Input cooler
    regions : list
        a list of genomic support regions
    regions2 : list
        a list of genomic support regions for asymmetric regions
    weight_name : str
        name of the weight vector in the "bins" table,
        if weight_name is None, 0 "bad" pixels are counted per block.
        Balancing weights are used to infer bad bins.
    bad_bins : array-like
        a list of bins to ignore. Indexes of bins must
        be absolute, as in clr.bins()[:], as opposed to
        being offset by chromosome start.
        "bad_bins" will be combined with the bad bins
        masked by balancing if there are any.

    Returns
    -------
    diag_tables : dict
        dictionary with DataFrames of relevant diagonals for every support.
    """

    regions = bioframe.parse_regions(regions, clr.chromsizes).values
    if regions2 is not None:
        regions2 = bioframe.parse_regions(regions2, clr.chromsizes).values

    bins = clr.bins()[:]
    if weight_name is None:
        # ignore bad bins
        sizes = dict(bins.groupby("chrom").size())
        bad_bin_dict = {
            chrom: np.zeros(sizes[chrom], dtype=bool) for chrom in sizes.keys()
        }
    elif isinstance(weight_name, str):
        # using balancing weight to infer bad bins
        if weight_name not in clr.bins().columns:
            raise KeyError("Balancing weight {weight_name} not found!")
        groups = dict(iter(bins.groupby("chrom")[weight_name]))
        bad_bin_dict = {
            chrom: np.array(groups[chrom].isnull()) for chrom in groups.keys()
        }
    else:
        raise ValueError("`weight_name` can be `str` or `None`")

    # combine custom "bad_bins" with "bad_bin_dict":
    if bad_bins is not None:
        # check if "bad_bins" are legit:
        try:
            bad_bins_chrom = bins.iloc[bad_bins].reset_index(drop=False)
        except IndexError:
            raise ValueError("Provided `bad_bins` are incorrect or out-of-bound")
        # group them by observed chromosomes only
        bad_bins_grp = bad_bins_chrom[["index", "chrom"]].groupby(
            "chrom", observed=True
        )
        # update "bad_bin_dict" with "bad_bins" for each chrom:
        for chrom, bin_ids in bad_bins_grp["index"]:
            co = clr.offset(chrom)
            # adjust by chromosome offset
            bad_bin_dict[chrom][bin_ids.values - co] = True

    diag_tables = {}
    for i in range(len(regions)):
        chrom, start1, end1, name1 = regions[i]
        if regions2 is not None:
            chrom2, start2, end2, name2 = regions2[i]
            # cis-only for now:
            assert chrom2 == chrom
        else:
            start2, end2 = start1, end1

        # translate regions into relative bin id-s:
        lo1, hi1 = clr.extent((chrom, start1, end1))
        lo2, hi2 = clr.extent((chrom, start2, end2))
        co = clr.offset(chrom)
        lo1 -= co
        lo2 -= co
        hi1 -= co
        hi2 -= co

        bad_mask = bad_bin_dict[chrom]
        newname = name1
        if regions2 is not None:
            newname = (name1, name2)
        diag_tables[newname] = make_diag_table(bad_mask, [lo1, hi1], [lo2, hi2])

    return diag_tables
Example #12
def compute_saddle(
    cool_path,
    track_path,
    expected_path,
    contact_type,
    min_dist,
    max_dist,
    n_bins,
    quantiles,
    range_,
    qrange,
    weight_name,
    strength,
    regions,
    out_prefix,
    fig,
    scale,
    cmap,
    vmin,
    vmax,
    hist_color,
    verbose,
):
    """
    Calculate saddle statistics and generate saddle plots for an arbitrary
    signal track on the genomic bins of a contact matrix.

    COOL_PATH : The path to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    TRACK_PATH : The path to bedGraph-like file with a binned compartment track
    (eigenvector), including a header. Use the '::' syntax to specify a column
    name.

    EXPECTED_PATH : The path to a tsv-like file with expected signal,
    including a header. Use the '::' syntax to specify a column name.

    Analysis will be performed for chromosomes referred to in TRACK_PATH, and
    therefore these chromosomes must be a subset of chromosomes referred to in
    COOL_PATH and EXPECTED_PATH.

    COOL_PATH, TRACK_PATH and EXPECTED_PATH must be binned at the same
    resolution (except for EXPECTED_PATH in the case of the trans contact
    type).

    EXPECTED_PATH must contain at least the following columns for cis
    contacts: 'chrom', 'diag', 'n_valid', value_name; and for trans
    contacts: 'chrom1', 'chrom2', 'n_valid', value_name. value_name is
    controlled using options. A header must be present in the file.

    """
    clr = cooler.Cooler(cool_path)
    expected_path, expected_name = expected_path
    track_path, track_name = track_path

    if regions is None:
        regions = [(chrom, 0, clr.chromsizes[chrom])
                   for chrom in clr.chromnames]
        regions = parse_regions(regions)
        regions["name"] = clr.chromnames
    else:
        regions_buf, names = util.sniff_for_header(regions)
        regions = pd.read_csv(regions_buf, sep="\t", header=None)
        if regions.shape[1] not in (3, 4):
            raise ValueError(
                "The region file does not have three or four tab-delimited columns."
                "We expect a bed file with columns chrom, start, end, and optional name"
            )
        if regions.shape[1] == 4:
            regions = regions.rename(columns={
                0: "chrom",
                1: "start",
                2: "end",
                3: "name"
            })
            regions = parse_regions(regions)
        else:
            regions = regions.rename(columns={
                0: "chrom",
                1: "start",
                2: "end"
            })
            regions["name"] = list(
                regions.apply(lambda x: "{}:{}-{}".format(*x), axis=1))
            regions = parse_regions(regions)

    if vmin <= 0 or vmax <= 0:
        raise ValueError(
            "vmin and vmax values are in original units irrespective "
            "of used scale, and therefore should be positive")

    # read expected and make preparations for validation,
    # it's contact_type dependent:
    if contact_type == "cis":
        # that's what we expect as column names:
        expected_columns = ["region", "diag", "n_valid", expected_name]
        # what would become a MultiIndex:
        expected_index = ["region", "diag"]
        # expected dtype as a rudimentary form of validation:
        expected_dtype = {
            "region": np.str,
            "diag": np.int64,
            "n_valid": np.int64,
            expected_name: np.float64,
        }
        # # unique list of chroms mentioned in expected_path:
        # get_exp_chroms = lambda df: df.index.get_level_values("region").unique()
        # # compute # of bins by comparing matching indexes:
        # get_exp_bins = lambda df, ref_chroms, _: (
        #     df.index.get_level_values("chrom").isin(ref_chroms).sum()
        # )
    elif contact_type == "trans":
        # that's what we expect as column names:
        expected_columns = ["region1", "region2", "n_valid", expected_name]
        # what would become a MultiIndex:
        expected_index = ["region1", "region2"]
        # expected dtype as a rudimentary form of validation:
        expected_dtype = {
            "region1": np.str,
            "region2": np.str,
            "n_valid": np.int64,
            expected_name: np.float64,
        }
        # # unique list of chroms mentioned in expected_path:
        # get_exp_chroms = lambda df: np.union1d(
        #     df.index.get_level_values("region1").unique(),
        #     df.index.get_level_values("region2").unique(),
        # )
        # # no way to get bins from trans-expected, so just get the number:
        # get_exp_bins = lambda _1, _2, correct_bins: correct_bins
    else:
        raise ValueError(
            "Incorrect contact_type: {}, ".format(contact_type),
            "Should have been caught by click.",
        )

    if min_dist < 0:
        min_diag = 3
    else:
        min_diag = int(np.ceil(min_dist / clr.binsize))

    if max_dist >= 0:
        max_diag = int(np.floor(max_dist / clr.binsize))
    else:
        max_diag = -1

    # use 'usecols' and 'dtype' as a rudimentary form of validation.
    # Keep 'comment' and 'verbose' explicit, as we may use them later:
    expected = pd.read_table(
        expected_path,
        usecols=expected_columns,
        index_col=expected_index,
        dtype=expected_dtype,
        comment=None,
        verbose=verbose,
    )

    # read bedGraph-file :
    track_columns = ["chrom", "start", "end", track_name]
    # specify dtype as a rudimentary form of validation:
    track_dtype = {
        "chrom": np.str,
        "start": np.int64,
        "end": np.int64,
        track_name: np.float64,
    }
    track = pd.read_table(
        track_path,
        usecols=track_columns,
        dtype=track_dtype,
        comment=None,
        verbose=verbose,
    )

    #############################################
    # CROSS-VALIDATE COOLER, EXPECTED AND TRACK:
    #############################################
    # TRACK vs COOLER:
    track_chroms = track["chrom"].unique()
    # We might want to try this eventually:
    # https://github.com/TMiguelT/PandasSchema
    # do simple column-name validation for now:
    if not set(track_chroms).issubset(clr.chromnames):
        raise ValueError(
            "Chromosomes in {} must be subset of ".format(track_path) +
            "chromosomes in cooler {}".format(cool_path))
    # check number of bins:
    track_bins = len(track)
    cool_bins = clr.bins()[:]["chrom"].isin(track_chroms).sum()
    if not (track_bins == cool_bins):
        raise ValueError(
            "Number of bins is not matching: ",
            "{} in {}, and {} in {} for chromosomes {}".format(
                track_bins, track_path, cool_bins, cool_path, track_chroms),
        )
    # # EXPECTED vs TRACK:
    # # validate expected a bit as well:
    # expected_chroms = get_exp_chroms(expected)
    # # do simple column-name validation for now:
    # if not set(track_chroms).issubset(expected_chroms):
    #     raise ValueError(
    #         "Chromosomes in {} must be subset of ".format(track_path)
    #         + "chromosomes in expected {}".format(expected_path)
    #     )
    # # and again bins are supposed to match up:
    # # only for cis though ...
    # expected_bins = get_exp_bins(expected, track_chroms, track_bins)
    # if not (track_bins == expected_bins):
    #     raise ValueError(
    #         "Number of bins is not matching: ",
    #         "{} in {}, and {} in {} for chromosomes {}".format(
    #             track_bins, track_path, expected_bins, expected_path, track_chroms
    #         ),
    #     )
    #############################################
    # CROSS-VALIDATION IS COMPLETE.
    #############################################

    track = saddle.mask_bad_bins((track, track_name),
                                 (clr.bins()[:], weight_name))

    if contact_type == "cis":
        getmatrix = saddle.make_cis_obsexp_fetcher(clr,
                                                   (expected, expected_name),
                                                   weight_name=weight_name)
    elif contact_type == "trans":
        getmatrix = saddle.make_trans_obsexp_fetcher(clr,
                                                     (expected, expected_name),
                                                     weight_name=weight_name)

    if quantiles:
        if len(range_):
            qlo, qhi = saddle.ecdf(track[track_name], range_)
        elif len(qrange):
            qlo, qhi = qrange
        else:
            qlo, qhi = 0.0, 1.0
        q_edges = np.linspace(qlo, qhi, n_bins)
        binedges = saddle.quantile(track[track_name], q_edges)
    else:
        if len(range_):
            lo, hi = range_
        elif len(qrange):
            lo, hi = saddle.quantile(track[track_name], qrange)
        else:
            lo, hi = track[track_name].min(), track[track_name].max()
        binedges = np.linspace(lo, hi, n_bins)

    digitized, hist = saddle.digitize_track(binedges,
                                            track=(track, track_name),
                                            regions=track_chroms)

    S, C = saddle.make_saddle(
        getmatrix,
        binedges,
        (digitized, track_name + ".d"),
        contact_type=contact_type,
        regions=regions,
        min_diag=min_diag,
        max_diag=max_diag,
    )

    saddledata = S / C

    to_save = dict(saddledata=saddledata, binedges=binedges, hist=hist)

    if strength:
        ratios = saddle.saddle_strength(S, C)
        ratios = ratios[1:-1]  # drop outlier bins
        to_save["saddle_strength"] = ratios

    # Save data
    np.savez(out_prefix + ".saddledump", **to_save)  # .npz auto-added
    digitized.to_csv(out_prefix + ".digitized.tsv", sep="\t", index=False)

    # Generate figure
    if len(fig):
        try:
            import matplotlib as mpl

            mpl.use("Agg")  # savefig only for now:
            import matplotlib.pyplot as plt
        except ImportError:
            print("Install matplotlib to use ", file=sys.stderr)
            sys.exit(1)

        if hist_color is None:
            color = (
                0.41568627450980394,
                0.8,
                0.39215686274509803,
            )  # sns.color_palette('muted')[2]
        else:
            color = mpl.colors.colorConverter.to_rgb(hist_color)
        title = op.basename(cool_path) + " ({})".format(contact_type)
        if quantiles:
            edges = q_edges
            track_label = track_name + " quantiles"
        else:
            edges = binedges
            track_label = track_name
        clabel = "(contact frequency / expected)"

        saddle.saddleplot(edges,
                          hist,
                          saddledata,
                          scale=scale,
                          vmin=vmin,
                          vmax=vmax,
                          color=color,
                          title=title,
                          xlabel=track_label,
                          ylabel=track_label,
                          clabel=clabel,
                          cmap=cmap)

        for ext in fig:
            plt.savefig(out_prefix + "." + ext, bbox_inches="tight")
Ejemplo n.º 13
0
def cooler_cis_eig(
    clr,
    bins,
    regions=None,
    n_eigs=3,
    phasing_track_col="GC",
    balance="weight",
    ignore_diags=None,
    bad_bins=None,
    clip_percentile=99.9,
    sort_metric=None,
    map=map,
):
    """
    Compute compartment eigenvectors for a given cooler `clr` in a number of
    symmetric intra-chromosomal regions (cis-regions), or for each chromosome.
    Note that the amplitude of compartment eigenvectors is weighted by their
    corresponding eigenvalue.

    Parameters
    ----------
    clr : cooler
        cooler object to fetch data from
    bins : DataFrame
        table of bins derived from clr with phasing track added
    regions : iterable or DataFrame, optional
        if provided, eigenvectors are calculated for the regions only,
        otherwise chromosome-wide eigenvectors are computed, for chromosomes
        specified in bins.
    n_eigs : int
        number of eigenvectors to compute
    phasing_track_col : str, optional
        name of the column in the `bins` table; if provided, eigenvectors are
        flipped to achieve a positive correlation with `bins[phasing_track_col]`.
    balance : str
        name of the column with balancing weights to be used.
    ignore_diags : int, optional
        the number of diagonals to ignore. Derived from cooler metadata
        if not specified.
    bad_bins : array-like
        a list of bins to ignore. Indexes of bins must be absolute,
        as in clr.bins()[:], as opposed to being offset by chromosome start.
        `bad_bins` will be combined with the bad bins masked by balancing.
    clip_percentile : float
        if >0 and <100, clip pixels with diagonal-normalized values
        higher than the specified percentile of matrix-wide values.
    sort_metric : str
        If provided, re-sort `eigenvecs` and `eigvals` in the order of
        decreasing correlation between phasing_track and eigenvector, using the
        specified measure of correlation. Possible values:
        'pearsonr' - sort by decreasing Pearson correlation.
        'var_explained' - sort by decreasing absolute amount of variation in
        `eigvecs` explained by `phasing_track` (i.e. R^2 * var(eigvec))
        'MAD_explained' - sort by decreasing absolute amount of Median Absolute
        Deviation from the median of `eigvecs` explained by `phasing_track`
        (i.e. COMED(eigvec, phasing_track) * MAD(eigvec)).
        'spearmanr' - sort by decreasing Spearman correlation.
        This option is designed to report the most "biologically" informative
        eigenvectors first, and prevent eigenvector swapping caused by
        translocations. In reality, however, sometimes it shows poor
        performance and may lead to reporting of non-informative eigenvectors.
        Off by default.
    map : callable, optional
        Map functor implementation.
    Returns
    -------
    eigvals, eigvec_table -> DataFrames with eigenvalues for each region and
    a table of eigenvectors filled into the `bins` table.
    .. note:: ALWAYS check your EVs by eye. The first one occasionally does
              not reflect the compartment structure, but instead describes
              chromosomal arms or translocation blowouts. Possible mitigations:
              employ `regions` (e.g. arms) to avoid issues with chromosomal arms,
              and use `bad_bins` to ignore small translocations.
    """

    # get chromosomes from bins, if regions not specified:
    if regions is None:
        regions = list(
            bins["chrom"].unique())  # parse_regions fill in the rest

    # make sure phasing_track_col is in bins, if phasing is requested
    if phasing_track_col and (phasing_track_col not in bins):
        raise ValueError(f'No column "{phasing_track_col}" in the bin table')

    # regions to dataframe
    regions = bioframe.parse_regions(regions, clr.chromsizes)

    # ignore diags as in cooler unless specified
    ignore_diags = (clr._load_attrs("bins/weight").get("ignore_diags", 2)
                    if ignore_diags is None else ignore_diags)

    # prepare output table for eigen vectors
    eigvec_table = bins.copy()
    eigvec_columns = [f"E{i + 1}" for i in range(n_eigs)]
    for ev_col in eigvec_columns:
        eigvec_table[ev_col] = np.nan

    # prepare output table for eigenvalues
    eigvals_table = regions.copy()
    eigval_columns = [f"eigval{i + 1}" for i in range(n_eigs)]
    for eval_col in eigval_columns:
        eigvals_table[eval_col] = np.nan

    def _each(region):
        """
        perform eigen decomposition for a given region
        assuming safety checks are done outside of this
        function.
        Parameters
        ----------
        region: tuple-like
            tuple of the form (chroms,start,end,*)
        Returns
        -------
        _region, eigvals, eigvecs -> ndarrays
            array of eigenvalues and an array of eigenvectors
        """
        _region = region[:3]  # take only (chrom, start, end)
        A = clr.matrix(balance=balance).fetch(_region)

        # filter bad_bins relevant for the _region from A
        if bad_bins is not None:
            # filter bad_bins for the _region and turn relative:
            lo, hi = clr.extent(_region)
            bad_bins_region = bad_bins[(bad_bins >= lo) & (bad_bins < hi)]
            bad_bins_region -= lo
            if len(bad_bins_region) > 0:
                # apply bad bins to symmetric matrix A:
                A[:, bad_bins_region] = np.nan
                A[bad_bins_region, :] = np.nan

        # extract phasing track relevant for the _region
        phasing_track = (bioframe.select(bins,
                                         _region)[phasing_track_col].values
                         if phasing_track_col else None)

        eigvals, eigvecs = cis_eig(
            A,
            n_eigs=n_eigs,
            ignore_diags=ignore_diags,
            phasing_track=phasing_track,
            clip_percentile=clip_percentile,
            sort_metric=sort_metric,
        )

        return _region, eigvals, eigvecs

    # eigendecompose matrix per region (can be multiprocessed)
    # output assumes that the order of results matches regions
    results = map(_each, regions.values)

    # go through eigendecomposition results and fill in
    # output table eigvec_table and eigvals_table
    for _region, _eigvals, _eigvecs in results:
        idx = bioframe.select(eigvec_table, _region).index
        eigvec_table.at[idx, eigvec_columns] = _eigvecs.T
        idx = bioframe.select(eigvals_table, _region).index
        eigvals_table.at[idx, eigval_columns] = _eigvals

    return eigvals_table, eigvec_table
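A hedged usage sketch: attach a GC phasing track to the cooler bin table (bioframe.frac_gc with loaded FASTA records is one way, assumed available here), then compute per-arm eigenvectors; `arms` and `fasta_records` are assumed inputs.

bins = clr.bins()[:]
bins["GC"] = bioframe.frac_gc(bins, fasta_records)  # assumed genome input
eigvals_table, eigvec_table = cooler_cis_eig(
    clr, bins, regions=arms, n_eigs=3,
    phasing_track_col="GC", balance="weight",
)
# E1 conventionally carries the compartment signal -- but, as the
# docstring warns, always inspect the eigenvectors by eye.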
Example #14
def call_compartments(
    cool_path,
    reference_track,
    regions,
    contact_type,
    n_eigs,
    verbose,
    out_prefix,
    bigwig,
):
    """
    Perform eigenvalue decomposition on a cooler matrix to calculate
    compartment signal by finding the eigenvector that correlates best with the
    phasing track.


    COOL_PATH : the path to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    TRACK_PATH : the path to a BedGraph-like file that stores the phasing
    track as a column named track-name.

    BedGraph-like format assumes tab-separated columns chrom, start, stop and
    track-name.

    """
    clr = cooler.Cooler(cool_path)

    if reference_track is not None:

        # TODO: This all needs to be refactored into a more generic tabular file parser
        # Needs to handle stdin case too.
        track_path, col = reference_track
        buf, names = sniff_for_header(track_path)

        if names is None:
            if not isinstance(col, int):
                raise click.BadParameter(
                    "No header found. "
                    'Cannot find "{}" column without a header.'.format(col))

            track_name = "ref"
            kwargs = dict(
                header=None,
                usecols=[0, 1, 2, col],
                names=["chrom", "start", "end", track_name],
            )
        else:
            if isinstance(col, int):
                try:
                    col = names[col]
                except IndexError:
                    raise click.BadParameter(
                        'Column #{} not compatible with header "{}".'.format(
                            col, ",".join(names)))
            else:
                if col not in names:
                    raise click.BadParameter(
                        'Column "{}" not found in header "{}"'.format(
                            col, ",".join(names)))

            track_name = col
            kwargs = dict(header="infer",
                          usecols=["chrom", "start", "end", track_name])

        track_df = pd.read_table(buf,
                                 dtype={
                                     "chrom": str,
                                     "start": np.int64,
                                     "end": np.int64,
                                     track_name: np.float64,
                                 },
                                 comment="#",
                                 verbose=verbose,
                                 **kwargs)

        # we need to merge phasing track DataFrame with the cooler bins to get
        # a DataFrame with phasing info aligned and validated against bins inside of
        # the cooler file.
        track = pd.merge(left=clr.bins()[:],
                         right=track_df,
                         how="left",
                         on=["chrom", "start", "end"])

        # sanity check: if len(track) became larger than nbins, something in
        # the track_df didn't match the ["chrom", "start", "end"] keys
        # from clr.bins()[:] .
        if len(track) > len(clr.bins()):
            raise ValueError(
                "There is something in the {} that ".format(track_path) +
                "couldn't be merged with cooler-bins {}".format(cool_path))
    else:
        # use entire bin-table from cooler, when reference-track is not provided:
        track = clr.bins()[["chrom", "start", "end"]][:]
        track_name = None

    # define regions for cis compartment-calling
    # use input "regions" BED file or all chromosomes mentioned in "track":
    if regions is None:
        # use full chromosomes referred to in the track:
        track_chroms = track["chrom"].unique()
        cis_regions_table = bioframe.parse_regions(track_chroms,
                                                   clr.chromsizes)
        cis_regions_table["name"] = cis_regions_table["chrom"]
    else:
        if contact_type == "trans":
            raise NotImplementedError(
                "Regions not yet supported with trans contact type")
        # Flexible reading of the regions table:
        regions_buf, names = sniff_for_header(regions)
        cis_regions_table = pd.read_csv(regions_buf, sep="\t", header=None)
        if cis_regions_table.shape[1] not in (3, 4):
            raise ValueError(
                "The region file does not have three or four tab-delimited columns."
                "We expect a bed file with columns chrom, start, end, and optional name"
            )
        if cis_regions_table.shape[1] == 4:
            cis_regions_table = cis_regions_table.rename(columns={
                0: "chrom",
                1: "start",
                2: "end",
                3: "name"
            })
            cis_regions_table = bioframe.parse_regions(cis_regions_table)
        else:
            cis_regions_table = cis_regions_table.rename(columns={
                0: "chrom",
                1: "start",
                2: "end"
            })
            cis_regions_table = bioframe.parse_regions(cis_regions_table)
        # make sure custom regions are compatible with the track:
        track_chroms = track["chrom"].unique()
        cis_regions_table = cis_regions_table[cis_regions_table["chrom"].isin(
            track_chroms)].reset_index(drop=True)

    # it's contact_type dependent:
    if contact_type == "cis":
        eigvals, eigvec_table = eigdecomp.cooler_cis_eig(
            clr=clr,
            bins=track,
            regions=cis_regions_table,
            n_eigs=n_eigs,
            phasing_track_col=track_name,
            clip_percentile=99.9,
            sort_metric=None,
        )
    elif contact_type == "trans":
        eigvals, eigvec_table = eigdecomp.cooler_trans_eig(
            clr=clr,
            bins=track,
            n_eigs=n_eigs,
            partition=None,
            phasing_track_col=track_name,
            sort_metric=None,
        )

    # Output
    eigvals.to_csv(out_prefix + "." + contact_type + ".lam.txt",
                   sep="\t",
                   index=False)
    eigvec_table.to_csv(out_prefix + "." + contact_type + ".vecs.tsv",
                        sep="\t",
                        index=False)
    if bigwig:
        bioframe.to_bigwig(
            eigvec_table,
            clr.chromsizes,
            out_prefix + "." + contact_type + ".bw",
            value_field="E1",
        )
Example #15
def call_dots(
    cool_path,
    expected_path,
    regions,
    expected_name,
    weight_name,
    nproc,
    max_loci_separation,
    max_nans_tolerated,
    tile_size,
    kernel_width,
    kernel_peak,
    num_lambda_chunks,
    fdr,
    dots_clustering_radius,
    verbose,
    out_prefix,
):
    """
    Call dots on a Hi-C heatmap, at loci separations no larger than
    max_loci_separation.

    COOL_PATH : The path to a .cool file with a balanced Hi-C map.

    EXPECTED_PATH : The paths to a tsv-like file with expected cis-expected.

    Analysis will be performed for chromosomes referred to in EXPECTED_PATH, and
    therefore these chromosomes must be a subset of chromosomes referred to in
    COOL_PATH. Also chromosomes refered to in EXPECTED_PATH must be non-trivial,
    i.e., contain not-NaN signal. Thus, make sure to prune your EXPECTED_PATH
    before applying this script.

    COOL_PATH and EXPECTED_PATH must be binned at the same resolution.

    EXPECTED_PATH must contain at least the following columns for cis contacts:
    'region', 'diag', 'n_valid', value_name. value_name is controlled using
    options. A header must be present in the file.

    """
    clr = cooler.Cooler(cool_path)

    # preliminary SCHEMA for cis-expected
    region_column_name = "region"
    expected_columns = [region_column_name, "diag", "n_valid", expected_name]
    expected_dtypes = {
        region_column_name: str,  # np.str was removed in NumPy 1.24; use builtin str
        "diag": np.int64,
        "n_valid": np.int64,
        expected_name: np.float64,
    }

    try:
        expected = pd.read_table(
            expected_path,
            usecols=expected_columns,
            dtype=expected_dtypes,
            comment=None,
            verbose=verbose,
        )
    except ValueError as e:
        raise ValueError(
            "input expected does not match the schema\n"
            "tab-separated expected file must have a header as well") from e
    expected_index = [
        region_column_name,
        "diag",
    ]
    expected.set_index(expected_index, inplace=True)
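    # expected is now indexed by (region, diag); a single value can be looked
    # up as, e.g., expected.loc[("chr1", 10), expected_name]
    # (the region name "chr1" is illustrative)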
    # end of SCHEMA for cis-expected

    # Optionally read a region table provided by the user:
    if regions is None:
        try:
            uniq_regions = expected.index.get_level_values(
                region_column_name).unique()
            regions_table = bioframe.parse_regions(uniq_regions,
                                                   clr.chromsizes)
            regions_table["name"] = regions_table["chrom"]
        except ValueError as e:
            print(e)
            raise ValueError(
                "Cannot interpret regions from EXPECTED_PATH\n"
                "specify regions definitions using --regions option.")
    else:
        # Flexible reading of the regions table:
        regions_buf, names = util.sniff_for_header(regions)
        regions_table = pd.read_csv(regions_buf, sep="\t", header=None)
        if regions_table.shape[1] not in (3, 4):
            raise ValueError(
                "The region file does not have three or four tab-delimited columns. "
                "We expect a BED file with columns chrom, start, end, and an optional name."
            )
        # rename the columns; pandas ignores absent labels, so one rename
        # covers both the 3- and the 4-column case:
        regions_table = regions_table.rename(columns={
            0: "chrom",
            1: "start",
            2: "end",
            3: "name"
        })
        regions_table = bioframe.parse_regions(regions_table)
        regions_table = regions_table[regions_table["chrom"].isin(
            clr.chromnames)].reset_index(drop=True)

    # Enforce the appropriate column order (required for heatmap_tiles_generator_diag):
    regions_table = regions_table[["chrom", "start", "end", "name"]]

    # Input validation
    get_exp_regions = lambda df: df.index.get_level_values(region_column_name).unique()
    expected_regions = get_exp_regions(expected)

    # make sure the unique regions mentioned in expected_path
    # are a subset of the regions table
    if not set(expected_regions).issubset(regions_table["name"]):
        raise ValueError(
            "Regions in {} must be a subset of ".format(expected_path) +
            f"regions in {'regions table ' + regions if regions is not None else 'cooler'}"
        )

    # check that the number of bins per region matches between the cooler
    # and the expected table by comparing the matching indexes
    try:
        for region_name, group in expected.reset_index().groupby(
                region_column_name):
            n_diags = group.shape[0]
            region = regions_table.set_index("name").loc[region_name]
            lo, hi = clr.extent(region)
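            # a region spanning bins [lo, hi) contributes exactly hi - lo
            # diagonals to cis-expected, one row per diagonal: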
            assert n_diags == (hi - lo)
    except AssertionError:
        raise ValueError("Region shape mismatch between expected and cooler. "
                         "Are they using the same resolution?")
    # All the checks have passed:
    if verbose:
        print("{} and {} passed cross-compatibility checks.".format(
            cool_path, expected_path))

    # by now we have a usable regions_table and expected for most scenarios

    # Prepare some parameters.
    binsize = clr.binsize
    loci_separation_bins = int(max_loci_separation / binsize)
    tile_size_bins = int(tile_size / binsize)
    balance_factor = 1.0  # clr._load_attrs("bins/weight")["scale"]
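    # e.g., with max_loci_separation=2_000_000 on a 10 kb cooler
    # (binsize=10_000), loci_separation_bins == 200 (illustrative values)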

    # clustering is done in base-pair units for now, so this stays suppressed:
    # clustering_radius_bins = int(dots_clustering_radius/binsize)

    # kernels
    # 'upright' is a symmetrical inversion of "lowleft", so it is not needed.
    ktypes = ["donut", "vertical", "horizontal", "lowleft"]

    if (kernel_width is None) or (kernel_peak is None):
        w, p = dotfinder.recommend_kernel_params(binsize)
        print(
            f"Using kernel parameters w={w}, p={p} recommended for binsize {binsize}"
        )
    else:
        w, p = kernel_width, kernel_peak
        # add some sanity check for w,p:
        assert w > p, f"Wrong inner/outer kernel parameters w={w}, p={p}"
        print(f"Using kernel parameters w={w}, p={p} provided by user")

    # once the kernel parameters are set up, check max_nans_tolerated to make
    # sure kernel footprints overlapping a NaN-filled row/column on one side
    # are not "allowed"; this requires dynamic adjustment for the
    # "shrinking donut"
    assert max_nans_tolerated <= 2 * w, "Too many NaNs allowed!"
    # allowing more could lead to scoring the same pixel twice, i.e. duplicates.

    # generate standard kernels - consider providing custom ones
    kernels = {k: dotfinder.get_kernel(w, p, k) for k in ktypes}
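    # each kernel is a square mask of side 2*w + 1; e.g., w=5 yields
    # 11x11 footprints (w=5 is illustrative)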

    # list of tile coordinate ranges
    tiles = list(
        dotfinder.heatmap_tiles_generator_diag(clr, regions_table, w,
                                               tile_size_bins,
                                               loci_separation_bins))

    # lambda-chunking edges ...
    assert dotfinder.HiCCUPS_W1_MAX_INDX <= num_lambda_chunks <= 50
    base = 2**(1 / 3)
    ledges = np.concatenate((
        [-np.inf],
        np.logspace(
            0,
            num_lambda_chunks - 1,
            num=num_lambda_chunks,
            base=base,
            dtype=np.float64,  # np.float was removed in NumPy 1.24; use np.float64
        ),
        [np.inf],
    ))
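    # e.g., num_lambda_chunks=45 yields the edges
    # [-inf, 1, 2**(1/3), 2**(2/3), ..., 2**(44/3), inf]:
    # geometric bins growing by a factor of 2**(1/3) per chunk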

    # 1. Calculate genome-wide histograms of scores.
    gw_hist = dotfinder.scoring_and_histogramming_step(
        clr,
        expected,
        expected_name,
        weight_name,
        tiles,
        kernels,
        ledges,
        max_nans_tolerated,
        loci_separation_bins,
        nproc,
        verbose,
    )

    if verbose:
        print("Done building histograms ...")

    # 2. Determine the FDR thresholds.
    threshold_df, qvalues = dotfinder.determine_thresholds(
        kernels, ledges, gw_hist, fdr)

    # 3. Filter using FDR thresholds calculated in the histogramming step
    filtered_pixels = dotfinder.scoring_and_extraction_step(
        clr,
        expected,
        expected_name,
        weight_name,
        tiles,
        kernels,
        ledges,
        threshold_df,
        max_nans_tolerated,
        balance_factor,
        loci_separation_bins,
        op.join(op.dirname(out_prefix),
                op.basename(out_prefix) + ".enriched.tsv"),
        nproc,
        verbose,
        bin1_id_name="bin1_id",
        bin2_id_name="bin2_id",
    )

    # 4. Post-processing
    if verbose:
        print(
            f"Begin post-processing of {len(filtered_pixels)} filtered pixels")
        print("preparing to extract needed q-values ...")

    filtered_pixels_qvals = dotfinder.annotate_pixels_with_qvalues(
        filtered_pixels, qvalues, kernels)
    # 4a. clustering
    ########################################################################
    # Clustering has to be done on the annotated DataFrame of filtered pixels
    # because it has to be done independently for every region!
    ########################################################################
    filtered_pixels_annotated = cooler.annotate(filtered_pixels_qvals,
                                                clr.bins()[:])
    filtered_pixels_annotated = assign_regions(filtered_pixels_annotated,
                                               regions_table)
    # consider resetting the index here
    centroids = dotfinder.clustering_step(filtered_pixels_annotated,
                                          expected_regions,
                                          dots_clustering_radius, verbose)

    # 4b. filter by enrichment and qval
    postprocessed_calls = dotfinder.thresholding_step(centroids)

    # Final post-processed result
    if out_prefix is not None:

        postprocessed_fname = op.join(
            op.dirname(out_prefix),
            op.basename(out_prefix) + ".postproc.bedpe")

        postprocessed_calls.to_csv(postprocessed_fname,
                                   sep="\t",
                                   header=True,
                                   index=False,
                                   compression=None)
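
# A minimal invocation sketch for call_dots as defined above. Every path and
# parameter value here is hypothetical; only the keyword names come from the
# signature in this example.
call_dots(
    cool_path="sample.10000.cool",
    expected_path="sample.10000.expected.tsv",
    regions=None,                  # derive regions from the expected table
    expected_name="balanced.avg",
    weight_name="weight",
    nproc=4,
    max_loci_separation=2_000_000,
    max_nans_tolerated=1,
    tile_size=6_000_000,
    kernel_width=None,             # let recommend_kernel_params pick w and p
    kernel_peak=None,
    num_lambda_chunks=45,
    fdr=0.02,
    dots_clustering_radius=39_000,
    verbose=True,
    out_prefix="sample_dots",
)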