def merge_tad_bins(hic, boundary_id_list, filename):
    """
    Reduces the HiCMatrix by merging the counts of the bins that lie
    between consecutive tad boundaries and saves the result.

    A new merged bin is started at every bin index found in
    `boundary_id_list` and at every change of reference name, so a
    merged bin never spans two references. The coverage of a merged
    bin is the mean coverage of the bins it replaces.

    If no boundary and no reference change is found (or the matrix has
    no bins) nothing is merged, nothing is saved and "Nothing to merge."
    is logged.

    :param hic: HiCMatrix object. `hic.restoreMaskedBins()` is called on
                it and, when bins are merged, its correction factors
                are set to None and its matrix is replaced through
                `hic.update_matrix`.
    :param boundary_id_list: list of tad boundary bin ids. The ids are
                compared to positions in `hic.cut_intervals` as seen
                after `hic.restoreMaskedBins()`.
    :param filename: Name to save the resulting matrix
    :return: None
    """

    from hicexplorer.reduceMatrix import reduce_matrix
    hic.restoreMaskedBins()
    if len(hic.cut_intervals) == 0:
        # zip(*[]) yields nothing, so the four-way unpacking below
        # would fail with an unrelated looking ValueError.
        log.info("Nothing to merge.")
        return

    ref_name_list, start_list, end_list, coverage_list = zip(
        *hic.cut_intervals)
    # the ids are tested once per bin: use a set so that each test
    # does not scan the whole list.
    boundary_ids = set(boundary_id_list)
    new_bins = []
    bins_to_merge = []
    prev_ref = ref_name_list[0]

    # prepare new intervals
    idx_start = 0
    new_start = start_list[0]
    count = 0
    for idx, ref in enumerate(ref_name_list):
        # `count > 0` keeps a boundary at the very first bin from
        # closing an empty group.
        if (count > 0 and idx in boundary_ids) or ref != prev_ref:
            coverage = np.mean(coverage_list[idx_start:idx])
            new_bins.append((ref_name_list[idx_start], new_start,
                             end_list[idx - 1], coverage))
            bins_to_merge.append(list(range(idx_start, idx)))
            idx_start = idx
            new_start = start_list[idx]
            count = 0

        prev_ref = ref
        count += 1

    if not bins_to_merge:
        # no group was closed inside the loop, i.e. all bins would
        # end up in one single bin.
        log.info("Nothing to merge.")
        return

    # close the last group, which runs up to the end of the matrix
    coverage = np.mean(coverage_list[idx_start:])
    new_bins.append((ref, new_start, end_list[idx], coverage))
    bins_to_merge.append(list(range(idx_start, idx + 1)))
    # remove correction factors otherwise they are
    # saved but they no longer correspond to the
    # size of the matrix.
    hic.correction_factors = None

    hic.update_matrix(
        reduce_matrix(hic.matrix, bins_to_merge, diagonal=True), new_bins)

    hic.save(filename)
# Example #2
# 0
def merge_bins(hic, num_bins):
    """
    Merge the bins using the specified number of bins. This
    functions takes care to make new intervals

    Consecutive bins of the same reference are grouped `num_bins` at a
    time. A group never spans two references: a change of reference
    closes the current group early. Such an early closed group is
    dropped (and reported at debug level) when it has fewer than
    `num_bins / 2` bins. The coverage of a merged bin is the mean
    coverage of the bins it replaces.

    Parameters
    ----------

    hic : HiCMatrix object. The object returned by
        `remove_nans_if_needed(hic)` is the one that is updated.

    num_bins : number of consecutive bins to merge.

    Returns
    -------

    The HiCMatrix object, with its matrix reduced, its cut intervals
    replaced by the merged ones and `nan_bins` set to the indices of
    the columns of the reduced matrix that sum to zero. If there are
    no bins, the object is returned without merging.

    Set up a Hi-C test matrix
    >>> from scipy.sparse import csr_matrix
    >>> row, col = np.triu_indices(5)
    >>> cut_intervals = [('a', 0, 10, 0.5), ('a', 10, 20, 1),
    ... ('a', 20, 30, 1), ('a', 30, 40, 0.1), ('b', 40, 50, 1)]
    >>> hic = hm.hiCMatrix()
    >>> hic.nan_bins = []
    >>> matrix = np.array([
    ... [ 50, 10,  5,  3,   0],
    ... [  0, 60, 15,  5,   1],
    ... [  0,  0, 80,  7,   3],
    ... [  0,  0,  0, 90,   1],
    ... [  0,  0,  0,  0, 100]], dtype=np.int32)

    make the matrix symmetric:
    >>> from scipy.sparse import dia_matrix

    >>> dia = dia_matrix(([matrix.diagonal()], [0]), shape=matrix.shape)
    >>> hic.matrix = csr_matrix(matrix + matrix.T - dia)
    >>> hic.setMatrix(hic.matrix, cut_intervals)

    run merge_matrix
    >>> merge_matrix = merge_bins(hic, 2)
    >>> merge_matrix.cut_intervals
    [('a', 0, 20, 0.75), ('a', 20, 40, 0.55000000000000004), ('b', 40, 50, 1.0)]
    >>> merge_matrix.matrix.todense()
    matrix([[120,  28,   1],
            [ 28, 177,   4],
            [  1,   4, 100]], dtype=int32)
    """

    hic = remove_nans_if_needed(hic)
    if len(hic.cut_intervals) == 0:
        # zip(*[]) yields nothing, so the four-way unpacking below
        # would fail; without bins there is nothing to merge.
        log.debug("No bins to merge.\n")
        return hic

    # get the bins to merge
    ref_name_list, start_list, end_list, coverage_list = zip(
        *hic.cut_intervals)
    new_bins = []
    bins_to_merge = []
    prev_ref = ref_name_list[0]

    # prepare new intervals
    idx_start = 0
    new_start = start_list[0]
    count = 0
    for idx, ref in enumerate(ref_name_list):
        # close the current group when it is full or when the
        # reference changes
        if (count > 0 and count % num_bins == 0) or ref != prev_ref:
            if count < num_bins / 2:
                # only reachable through a reference change: a full
                # group always has `num_bins` bins.
                log.debug("{} has few bins ({}). Skipping it\n".format(
                    prev_ref, count))
            else:
                coverage = np.mean(coverage_list[idx_start:idx])
                new_bins.append((ref_name_list[idx_start], new_start,
                                 end_list[idx - 1], coverage))
                bins_to_merge.append(list(range(idx_start, idx)))
            idx_start = idx
            new_start = start_list[idx]
            count = 0

        prev_ref = ref
        count += 1

    # close the last group.
    # NOTE(review): unlike the groups closed inside the loop, the last
    # group is always kept, even if it has fewer than num_bins / 2
    # bins - confirm that this is intended.
    coverage = np.mean(coverage_list[idx_start:])
    new_bins.append((ref, new_start, end_list[idx], coverage))
    bins_to_merge.append(list(range(idx_start, idx + 1)))

    hic.matrix = reduce_matrix(hic.matrix, bins_to_merge, diagonal=True)
    hic.matrix.eliminate_zeros()
    hic.setCutIntervals(new_bins)
    # np.asarray accepts both the np.matrix and the ndarray that a
    # sparse sum may return; flatnonzero then gives flat column ids.
    column_sums = np.asarray(hic.matrix.sum(0))
    hic.nan_bins = np.flatnonzero(column_sums == 0)

    return hic