def merge_tad_bins(hic, boundary_id_list, filename):
    """
    Reduces the HiCMatrix by merging the counts of tad bins.

    Masked bins are restored first. The bins are then scanned in order and
    split into groups of consecutive bins: a new group starts at every bin
    id found in `boundary_id_list` and at every change of reference name.
    Each group becomes one new bin that spans from the start of its first
    bin to the end of its last bin; its coverage is the mean coverage of
    the bins in the group.

    If at least one split point is found, the correction factors of `hic`
    are set to None, its matrix and bins are replaced through
    `hic.update_matrix` and the result is written with `hic.save(filename)`.
    Otherwise "Nothing to merge." is logged and nothing is saved.

    :param hic: HiCMatrix object (modified in place)
    :param boundary_id_list: list of tad boundary bin ids
    :param filename: Name to save the resulting matrix

    :return: None
    """
    from hicexplorer.reduceMatrix import reduce_matrix

    hic.restoreMaskedBins()
    ref_name_list, start_list, end_list, coverage_list = zip(
        *hic.cut_intervals)

    # Membership is tested once per bin, so build a hash-based lookup once
    # instead of scanning the boundary list on every iteration. If the
    # entries are not hashable, keep using the container that was passed in.
    try:
        boundary_ids = frozenset(boundary_id_list)
    except TypeError:
        boundary_ids = boundary_id_list

    new_bins = []
    bins_to_merge = []
    prev_ref = ref_name_list[0]

    # prepare new intervals
    idx_start = 0
    new_start = start_list[0]
    count = 0
    for idx, ref in enumerate(ref_name_list):
        # `count > 0` prevents closing an empty group, e.g. when the very
        # first bin is listed as a boundary.
        if (count > 0 and idx in boundary_ids) or ref != prev_ref:
            coverage = np.mean(coverage_list[idx_start:idx])
            new_bins.append((ref_name_list[idx_start], new_start,
                             end_list[idx - 1], coverage))
            bins_to_merge.append(list(range(idx_start, idx)))
            idx_start = idx
            new_start = start_list[idx]
            count = 0

        prev_ref = ref
        count += 1

    # No group was closed inside the loop, i.e. no split point was found:
    # leave the matrix untouched and do not save anything.
    if not bins_to_merge:
        log.info("Nothing to merge.")
        return

    # close the group that was still open when the loop ended
    coverage = np.mean(coverage_list[idx_start:])
    new_bins.append((ref, new_start, end_list[idx], coverage))
    bins_to_merge.append(list(range(idx_start, idx + 1)))

    # remove correction factors otherwise they are
    # saved but they no longer correspond to the
    # size of the matrix.
    hic.correction_factors = None
    hic.update_matrix(
        reduce_matrix(hic.matrix, bins_to_merge, diagonal=True), new_bins)
    hic.save(filename)
def merge_bins(hic, num_bins):
    """
    Merge runs of consecutive bins into larger bins and rebuild the
    matching intervals.

    The bins are scanned in order. A group is closed once it holds a
    multiple of `num_bins` bins or when the reference name changes, so a
    group never spans two reference names. A group closed during the scan
    with fewer than `num_bins / 2` bins is left out of the merge list and a
    debug message is logged; the group still open when the scan ends is
    always kept. Every merged interval runs from the start of its first
    bin to the end of its last bin and carries the mean coverage of its
    members.

    Parameters
    ----------
    hic : HiCMatrix object
    num_bins : number of consecutive bins to merge.

    Returns
    -------
    The HiCMatrix object returned by `remove_nans_if_needed(hic)`, with its
    matrix, cut intervals and nan_bins replaced by the merged versions.

    Set up a Hi-C test matrix
    >>> from scipy.sparse import csr_matrix
    >>> row, col = np.triu_indices(5)
    >>> cut_intervals = [('a', 0, 10, 0.5), ('a', 10, 20, 1),
    ... ('a', 20, 30, 1), ('a', 30, 40, 0.1), ('b', 40, 50, 1)]
    >>> hic = hm.hiCMatrix()
    >>> hic.nan_bins = []
    >>> matrix = np.array([
    ... [ 50, 10,  5,  3,   0],
    ... [  0, 60, 15,  5,   1],
    ... [  0,  0, 80,  7,   3],
    ... [  0,  0,  0, 90,   1],
    ... [  0,  0,  0,  0, 100]], dtype=np.int32)

    make the matrix symmetric:
    >>> from scipy.sparse import dia_matrix
    >>> dia = dia_matrix(([matrix.diagonal()], [0]), shape=matrix.shape)
    >>> hic.matrix = csr_matrix(matrix + matrix.T - dia)
    >>> hic.setMatrix(hic.matrix, cut_intervals)

    run merge_matrix
    >>> merge_matrix = merge_bins(hic, 2)
    >>> merge_matrix.cut_intervals
    [('a', 0, 20, 0.75), ('a', 20, 40, 0.55000000000000004), ('b', 40, 50, 1.0)]
    >>> merge_matrix.matrix.todense()
    matrix([[120,  28,   1],
            [ 28, 177,   4],
            [  1,   4, 100]], dtype=int32)
    """
    hic = remove_nans_if_needed(hic)

    # split the interval tuples into one sequence per field
    chroms, starts, ends, coverages = zip(*hic.cut_intervals)

    merged_intervals = []
    merge_groups = []

    # state of the group currently being collected
    group_first = 0
    group_start = starts[0]
    group_size = 0
    last_chrom = chroms[0]

    for pos, chrom in enumerate(chroms):
        group_is_full = group_size > 0 and group_size % num_bins == 0
        if group_is_full or chrom != last_chrom:
            if group_size < num_bins / 2:
                # too few bins collected before the group was closed
                log.debug("{} has few bins ({}). Skipping it\n".format(
                    last_chrom, group_size))
            else:
                merged_intervals.append((
                    chroms[group_first],
                    group_start,
                    ends[pos - 1],
                    np.mean(coverages[group_first:pos]),
                ))
                merge_groups.append(list(range(group_first, pos)))

            # open a new group at the current bin
            group_first = pos
            group_start = starts[pos]
            group_size = 0

        last_chrom = chrom
        group_size += 1

    # The group still open after the scan is always kept, whatever its size.
    merged_intervals.append((
        chroms[-1],
        group_start,
        ends[-1],
        np.mean(coverages[group_first:]),
    ))
    merge_groups.append(list(range(group_first, len(chroms))))

    hic.matrix = reduce_matrix(hic.matrix, merge_groups, diagonal=True)
    hic.matrix.eliminate_zeros()
    hic.setCutIntervals(merged_intervals)
    # bins whose column sum is zero after merging are recorded as nan bins
    hic.nan_bins = np.flatnonzero(hic.matrix.sum(0).A == 0)

    return hic