def testFlyHiC():
    """Check coolerInfo() against values read directly from two fly Hi-C coolers.

    For each metadata field ('bin-size', 'sum', 'nbins', 'nnz', 'nchroms')
    the value reported by coolerInfo() must equal the value obtained from
    the cooler object itself (or, for 'bin-size', the bin size returned by
    readMcool()).
    """
    paths = (
        "tests/data/fly_hi-c/4DNFI8DRD739_bin100kb.cool",
        "tests/data/fly_hi-c/4DNFIZ1ZVXC8_bin100kb.cool",
    )

    # Load both inputs before running any check
    samples = []
    for path in paths:
        clr, size = readMcool(path, -1)
        samples.append((path, clr, size))

    # Metadata field -> independent way of obtaining the same quantity
    references = (
        ('bin-size', lambda clr, size: size),
        ('sum', lambda clr, size: clr.pixels()['count'][:].sum()),
        ('nbins', lambda clr, size: clr.bins().shape[0]),
        ('nnz', lambda clr, size: clr.pixels().shape[0]),
        ('nchroms', lambda clr, size: clr.chroms().shape[0]),
    )

    # Check various .info() fields for consistency, field by field and
    # within each field file by file
    for field, derive in references:
        for path, clr, size in samples:
            assert coolerInfo(clr, field) == derive(clr, size),\
                f"coolerInfo() failed to retrieve metadata '{field}' from {path}"
def testHumanHiCInfo():
    """Check coolerInfo() against values read directly from two human Hi-C coolers.

    Each of the metadata fields 'bin-size', 'sum', 'nbins', 'nnz' and
    'nchroms' reported by coolerInfo() is compared with the same quantity
    taken from the cooler object (or, for 'bin-size', from the bin size
    returned by readMcool()).
    """
    inputs = [
        "tests/data/human_hi-c/4DNFITKCX2DO.cool",
        "tests/data/human_hi-c/4DNFIQ5XCHDB.cool",
    ]

    # Read every input first so that loading problems surface before checks
    opened = []
    for fname in inputs:
        contacts, resolution = readMcool(fname, -1)
        opened.append((fname, contacts, resolution))

    # How to obtain each metadata value without going through coolerInfo()
    groundTruth = {
        'bin-size': lambda contacts, resolution: resolution,
        'sum': lambda contacts, resolution: contacts.pixels()['count'][:].sum(),
        'nbins': lambda contacts, resolution: contacts.bins().shape[0],
        'nnz': lambda contacts, resolution: contacts.pixels().shape[0],
        'nchroms': lambda contacts, resolution: contacts.chroms().shape[0],
    }

    # Check various .info() fields for consistency
    for key, getter in groundTruth.items():
        for fname, contacts, resolution in opened:
            assert coolerInfo(contacts, key) == getter(contacts, resolution),\
                f"coolerInfo() failed to retrieve metadata '{key}' from {fname}"
def testFlyHiC(): fmcool1 = "tests/data/fly_hi-c/4DNFI8DRD739_bin100kb.cool" fmcool2 = "tests/data/fly_hi-c/4DNFIZ1ZVXC8_bin100kb.cool" binSize = -1 h = 1 dBPMax = 500000 bDownSample = False cool1, binSize1 = readMcool(fmcool1, binSize) cool2, binSize2 = readMcool(fmcool2, binSize) assert coolerInfo(cool1, 'nbins') == coolerInfo(cool2, 'nbins'),\ f"Input cool files {fmcool1} and {fmcool2} have different number of bins" assert binSize1 == binSize2,\ f"Input cool files {fmcool1} and {fmcool2} have different bin sizes" assert coolerInfo(cool1, 'nchroms') == coolerInfo(cool2, 'nchroms'),\ f"Input cool files {fmcool1} and {fmcool2} have different number of chromosomes" assert (cool1.chroms()[:] == cool2.chroms()[:]).all()[0],\ f"Input file {fmcool1} and {fmcool2} have different chromosome names" results = hicrepSCC(cool1, cool2, h, dBPMax, bDownSample) expected = np.array([ 9.936753824600870e-01, 9.950138992224218e-01, 9.951519844417879e-01, 9.935973973292749e-01, 9.933660605077106e-01, 9.927681695925705e-01, 6.238132870270471e-01 ]) assert np.isclose(results, expected).all() # Test the computation of a subset of chromosomes give the same results as # the whole set chrNames = ['chr2L', 'chr2R', 'chrX'] resultsSub = hicrepSCC(cool1, cool2, h, dBPMax, bDownSample, chrNames) chrNamesAll = cool1.chroms()[:]['name'].tolist() chrNamesAll = [name for name in chrNamesAll if name != 'M'] iChrs = np.where(np.isin(chrNamesAll, chrNames))[0] assert (results[iChrs] == resultsSub).all(), f""" SCC scores between {fmcool1} and {fmcool2} on chromosome subset {chrNames} differ from those computed from the whole set. The whole genome results are: {results} and the subset indices are {iChrs}. 
""" # Test the computation when excluding chromosomes give the same results as # the whole set exclNames = set(['chr2L', 'chr2R', 'chrX']) resultsExcl = hicrepSCC(cool1, cool2, h, dBPMax, bDownSample, chrNamesAll, exclNames) chrNamesRemain = [name for name in chrNamesAll if name not in exclNames] ieChrs = np.where(np.isin(chrNamesAll, chrNamesRemain))[0] assert (results[ieChrs] == resultsExcl).all(), f"""
def testFlyHiC():
    """Regression test of hicrepSCC() on two fly Hi-C cooler files.

    Verifies that the two inputs are comparable (same number of bins, bin
    size, number of chromosomes and chromosome table) and then checks the
    SCC scores returned by hicrepSCC() against stored reference values.
    """
    fmcool1 = "tests/data/fly_hi-c/4DNFI8DRD739_bin100kb.cool"
    fmcool2 = "tests/data/fly_hi-c/4DNFIZ1ZVXC8_bin100kb.cool"
    binSize = -1
    h = 1
    dBPMax = 500000
    bDownSample = False
    cool1, binSize1 = readMcool(fmcool1, binSize)
    cool2, binSize2 = readMcool(fmcool2, binSize)

    # Sanity checks on the inputs before computing anything
    assert coolerInfo(cool1, 'nbins') == coolerInfo(cool2, 'nbins'),\
        f"Input cool files {fmcool1} and {fmcool2} have different number of bins"
    assert binSize1 == binSize2,\
        f"Input cool files {fmcool1} and {fmcool2} have different bin sizes"
    assert coolerInfo(cool1, 'nchroms') == coolerInfo(cool2, 'nchroms'),\
        f"Input cool files {fmcool1} and {fmcool2} have different number of chromosomes"
    # .all() yields one boolean per column, labelled by column name; take the
    # first column's result by position with .iloc because integer keys on a
    # label-indexed Series are deprecated in pandas.
    assert (cool1.chroms()[:] == cool2.chroms()[:]).all().iloc[0],\
        f"Input file {fmcool1} and {fmcool2} have different chromosome names"

    results = hicrepSCC(cool1, cool2, h, dBPMax, bDownSample)
    expected = np.array([
        9.936753824600870e-01, 9.950138992224218e-01, 9.951519844417879e-01,
        9.935973973292749e-01, 9.933660605077106e-01, 9.927681695925705e-01,
        6.238132870270471e-01
    ])
    # Guard against a silently broadcast comparison (e.g. a length-1 result)
    assert np.shape(results) == expected.shape,\
        f"Expected {expected.shape[0]} SCC scores but got shape {np.shape(results)}"
    assert np.isclose(results, expected).all()
def hicrepSCC(cool1: cooler.api.Cooler, cool2: cooler.api.Cooler,
              h: int, dBPMax: int, bDownSample: bool,
              chrNames: list = None, excludeChr: set = None):
    """Compute hicrep score between two input Cooler contact matrices

    Args:
        cool1: `cooler.api.Cooler` Input Cooler contact matrix 1
        cool2: `cooler.api.Cooler` Input Cooler contact matrix 2
        h: `int` Half-size of the mean filter used to smooth the
        input matrices. No smoothing is applied when h <= 0
        dBPMax `int` Only include contacts that are at most this genomic
        distance (bp) away. -1 means the number of bins of the first input
        is used as the (exclusive) upper bound on the diagonal index
        bDownSample: `bool` Down sample the input with more contacts
        to the same number of contacts as in the other input. When False,
        each matrix is instead divided by its total number of contacts
        chrNames: `list` List of chromosome names whose SCC to compute.
        Default to None, which means all chromosomes in the genome are used
        to compute SCC. Must not contain duplicates
        excludeChr: `set` Set of chromosome names to exclude from SCC
        computation. Default to None.

    Returns:
        `numpy.ndarray` of `float`: one scc score per chromosome, in the
        order of `chrNames` (or of the chromosome table of `cool1` when
        `chrNames` is None) after the excluded names are removed

    Raises:
        AssertionError: if the two inputs are not comparable (bin size,
        number of bins, number of chromosomes, chromosome table), if
        `dBPMax` is smaller than the bin size, if `chrNames` has duplicates,
        or if a per-chromosome contact matrix is empty, not square, or of
        different shape between the two inputs

    Warns:
        RuntimeWarning: if the inputs have no unique bin size and the
        median bin size of the first input is used instead
    """
    binSize1 = cool1.binsize
    binSize2 = cool2.binsize
    assert binSize1 == binSize2,\
        "Input cool files have different bin sizes"
    assert coolerInfo(cool1, 'nbins') == coolerInfo(cool2, 'nbins'),\
        "Input cool files have different number of bins"
    assert coolerInfo(cool1, 'nchroms') == coolerInfo(cool2, 'nchroms'),\
        "Input cool files have different number of chromosomes"
    # .all() gives one boolean per column, labelled by column name; pick the
    # first column's result by position with .iloc, since integer keys on a
    # label-indexed Series are deprecated in pandas.
    assert (cool1.chroms()[:] == cool2.chroms()[:]).all().iloc[0],\
        "Input file have different chromosome names"
    binSize = binSize1
    bins1 = cool1.bins()
    bins2 = cool2.bins()
    if binSize is None:
        # sometimes bin size can be None, e.g., input cool file has
        # non-uniform size bins.
        assert np.all(bins1[:] == bins2[:]),\
            f"Input cooler files don't have a unique bin size most likely "\
            f"because non-uniform bin size was used and the bins are defined "\
            f"differently for the two input cooler files"
        # In that case, use the median bin size
        binSize = int(np.median((bins1[:]["end"] - bins1[:]["start"]).values))
        warnings.warn(f"Input cooler files don't have a unique bin size most "\
                      f"likely because non-uniform bin size was used. HicRep "\
                      f"will use median bin size from the first cooler file "\
                      f"to determine maximal diagonal index to include",
                      RuntimeWarning)
    if dBPMax == -1:
        # this is the exclusive upper bound
        dMax = coolerInfo(cool1, 'nbins')
    else:
        dMax = dBPMax // binSize + 1
    assert dMax > 1, "Input dBPmax is smaller than binSize"
    p1 = cool2pixels(cool1)
    p2 = cool2pixels(cool2)
    # get the total number of contacts as normalizing constant
    n1 = coolerInfo(cool1, 'sum')
    n2 = coolerInfo(cool2, 'sum')
    # Use dict here so that the chrNames don't duplicate
    if chrNames is None:
        chrNamesDict = dict.fromkeys(cool1.chroms()[:]['name'].tolist())
    else:
        chrNamesDict = dict.fromkeys(chrNames)
    # It's important to preserve the order of the input chrNames so that the
    # user knows the order of the output SCC scores so we bail when encounter
    # duplicate names rather than implicit pruning the names.
    assert chrNames is None or len(chrNamesDict) == len(chrNames), f"""
        Found Duplicates in {chrNames}. Please remove them.
        """
    # filter out excluded chromosomes
    if excludeChr is None:
        excludeChr = set()
    chrNames = [
        chrName for chrName in chrNamesDict if chrName not in excludeChr
    ]
    # -2.0 is a placeholder that every iteration below overwrites
    scc = np.full(len(chrNames), -2.0)
    for iChr, chrName in enumerate(chrNames):
        # extract and validate the contact matrix of this chromosome
        mS1 = getSubCoo(p1, bins1, chrName)
        assert mS1.size > 0,\
            f"Contact matrix 1 of chromosome {chrName} is empty"
        assert mS1.shape[0] == mS1.shape[1],\
            f"Contact matrix 1 of chromosome {chrName} is not square"
        mS2 = getSubCoo(p2, bins2, chrName)
        assert mS2.size > 0,\
            f"Contact matrix 2 of chromosome {chrName} is empty"
        assert mS2.shape[0] == mS2.shape[1],\
            f"Contact matrix 2 of chromosome {chrName} is not square"
        assert mS1.shape == mS2.shape,\
            f"Contact matrices of chromosome {chrName} have different input shape"
        # dMax > 1 is guaranteed above, so only the matrix size can cap it
        nDiags = min(dMax, mS1.shape[0])
        # remove major diagonal and all the diagonals >= nDiags
        # to save computation time
        m1 = trimDiags(mS1, nDiags, False)
        m2 = trimDiags(mS2, nDiags, False)
        # drop the untrimmed matrices as soon as they are no longer needed
        del mS1
        del mS2
        if bDownSample:
            # do downsampling: only the input with more contacts is resampled
            size1 = m1.sum()
            size2 = m2.sum()
            if size1 > size2:
                m1 = resample(m1, size2).astype(float)
            elif size2 > size1:
                m2 = resample(m2, size1).astype(float)
        else:
            # just normalize by total contacts
            m1 = m1.astype(float) / n1
            m2 = m2.astype(float) / n2
        if h > 0:
            # apply smoothing
            m1 = meanFilterSparse(m1, h)
            m2 = meanFilterSparse(m2, h)
        scc[iChr] = sccByDiag(m1, m2, nDiags)
    return scc
def hicrepSCC(cool1: cooler.api.Cooler, cool2: cooler.api.Cooler,
              h: int, dBPMax: int, bDownSample: bool):
    """Compute hicrep score between two input Cooler contact matrices

    Args:
        cool1: `cooler.api.Cooler` Input Cooler contact matrix 1
        cool2: `cooler.api.Cooler` Input Cooler contact matrix 2
        h: `int` Half-size of the mean filter used to smooth the
        input matrices. No smoothing is applied when h <= 0
        dBPMax `int` Only include contacts that are at most this genomic
        distance (bp) away. -1 means the number of bins of the first input
        is used as the (exclusive) upper bound on the diagonal index
        bDownSample: `bool` Down sample the input with more contacts
        to the same number of contacts as in the other input. When False,
        each matrix is instead divided by its total number of contacts

    Returns:
        `numpy.ndarray` of `float`: one scc score per chromosome, in the
        order of the chromosome table of `cool1` with the chromosome named
        'M' left out

    Raises:
        AssertionError: if the two inputs are not comparable (bin size,
        number of bins, number of chromosomes, chromosome table), if
        `dBPMax` is smaller than the bin size, or if a per-chromosome
        contact matrix is empty, not square, or of different shape between
        the two inputs

    Warns:
        RuntimeWarning: if the inputs have no unique bin size and the
        median bin size of the first input is used instead
    """
    binSize1 = cool1.binsize
    binSize2 = cool2.binsize
    assert binSize1 == binSize2,\
        "Input cool files have different bin sizes"
    assert coolerInfo(cool1, 'nbins') == coolerInfo(cool2, 'nbins'),\
        "Input cool files have different number of bins"
    assert coolerInfo(cool1, 'nchroms') == coolerInfo(cool2, 'nchroms'),\
        "Input cool files have different number of chromosomes"
    # .all() gives one boolean per column, labelled by column name; pick the
    # first column's result by position with .iloc, since integer keys on a
    # label-indexed Series are deprecated in pandas.
    assert (cool1.chroms()[:] == cool2.chroms()[:]).all().iloc[0],\
        "Input file have different chromosome names"
    binSize = binSize1
    bins1 = cool1.bins()
    bins2 = cool2.bins()
    if binSize is None:
        # sometimes bin size can be None, e.g., input cool file has
        # non-uniform size bins.
        assert np.all(bins1[:] == bins2[:]),\
            f"Input cooler files don't have a unique bin size most likely "\
            f"because non-uniform bin size was used and the bins are defined "\
            f"differently for the two input cooler files"
        # In that case, use the median bin size
        binSize = int(np.median((bins1[:]["end"] - bins1[:]["start"]).values))
        warnings.warn(f"Input cooler files don't have a unique bin size most "\
                      f"likely because non-uniform bin size was used. HicRep "\
                      f"will use median bin size from the first cooler file "\
                      f"to determine maximal diagonal index to include",
                      RuntimeWarning)
    if dBPMax == -1:
        # this is the exclusive upper bound
        dMax = coolerInfo(cool1, 'nbins')
    else:
        dMax = dBPMax // binSize + 1
    assert dMax > 1, "Input dBPmax is smaller than binSize"
    p1 = cool2pixels(cool1)
    p2 = cool2pixels(cool2)
    # get the total number of contacts as normalizing constant
    n1 = coolerInfo(cool1, 'sum')
    n2 = coolerInfo(cool2, 'sum')
    chrNames = cool1.chroms()[:]['name'].to_numpy()
    # filter out mitochondria chromosome
    chrNames = np.array([name for name in chrNames if name != 'M'])
    # -2.0 is a placeholder that every iteration below overwrites
    scc = np.full(chrNames.shape[0], -2.0)
    for iChr, chrName in enumerate(chrNames):
        # extract and validate the contact matrix of this chromosome
        mS1 = getSubCoo(p1, bins1, chrName)
        assert mS1.size > 0,\
            f"Contact matrix 1 of chromosome {chrName} is empty"
        assert mS1.shape[0] == mS1.shape[1],\
            f"Contact matrix 1 of chromosome {chrName} is not square"
        mS2 = getSubCoo(p2, bins2, chrName)
        assert mS2.size > 0,\
            f"Contact matrix 2 of chromosome {chrName} is empty"
        assert mS2.shape[0] == mS2.shape[1],\
            f"Contact matrix 2 of chromosome {chrName} is not square"
        assert mS1.shape == mS2.shape,\
            f"Contact matrices of chromosome {chrName} have different input shape"
        # dMax > 1 is guaranteed above, so only the matrix size can cap it
        nDiags = min(dMax, mS1.shape[0])
        # remove major diagonal and all the diagonals >= nDiags
        # to save computation time
        m1 = trimDiags(mS1, nDiags, False)
        m2 = trimDiags(mS2, nDiags, False)
        # drop the untrimmed matrices as soon as they are no longer needed
        del mS1
        del mS2
        if bDownSample:
            # do downsampling: only the input with more contacts is resampled
            size1 = m1.sum()
            size2 = m2.sum()
            if size1 > size2:
                m1 = resample(m1, size2).astype(float)
            elif size2 > size1:
                m2 = resample(m2, size1).astype(float)
        else:
            # just normalize by total contacts
            m1 = m1.astype(float) / n1
            m2 = m2.astype(float) / n2
        if h > 0:
            # apply smoothing
            m1 = meanFilterSparse(m1, h)
            m2 = meanFilterSparse(m2, h)
        scc[iChr] = sccByDiag(m1, m2, nDiags)
    return scc