def test_getCountsByDistance():
    """Check ``getCountsByDistance`` for a small count matrix and an all-NaN matrix.

    Both cases use five bins: four on chromosome 'a' and one on 'b'.
    The returned mapping is indexed by distance (0, 10, 20, 30) and by -1.
    """
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('a', 30, 40, 1), ('b', 40, 50, 1)]

    # --- case 1: regular counts ---
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, cut_intervals)

    distance = hic.getCountsByDistance()

    expected_counts = (
        (-1, [0, 1, 2, 1]),
        (0, [1, 4, 0, 0, 0]),
        (10, [8, 15, 0]),
        (20, [5, 5]),
        (30, [3]),
    )
    for key, expected in expected_counts:
        nt.assert_equal(distance[key], expected)

    # --- case 2: matrix filled with NaN, every count is expected to be 0 ---
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    nan_filled = np.matrix([[np.nan] * 5 for _ in range(5)])
    hic.matrix = csr_matrix(nan_filled)
    hic.setMatrix(hic.matrix, cut_intervals)

    distance = hic.getCountsByDistance()

    expected_zeros = (
        (-1, [0, 0, 0, 0]),
        (0, [0, 0, 0, 0, 0]),
        (10, [0, 0, 0]),
        (20, [0, 0]),
        (30, [0]),
    )
    for key, expected in expected_zeros:
        nt.assert_equal(distance[key], expected)
def test_save_GInteractions():
    """
    Save a small matrix with ``save_GInteractions`` and try to load it again.

    The load step is expected to fail: GInteractions output is saved as .tsv,
    but ``__init__`` can only process .npz, h5, dekker and cool; any other
    file is treated as h5f.
    """
    outfile = '/tmp/matrix_GInteractions'
    # make sure the output path exists (open for reading, else create it)
    try:
        handle = open(outfile, 'r')
    except Exception:
        handle = open(outfile, 'w')
    handle.close()

    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('a', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, cut_intervals)
    # make matrix symmetric
    hic.matrix = hm.hiCMatrix.fillLowerTriangle(hic.matrix)

    hic.save_GInteractions(outfile)

    # test fails during load
    hm.hiCMatrix(outfile)
def test_save_load_cooler_format():
    """Round-trip a small matrix through a ``.cool`` file and compare the result.

    The CSR components (data, indices, indptr) and the cut intervals of the
    reloaded matrix must match the saved one. nan_bins and correction factors
    are not compared because the cool format does not support them.
    """
    outfile = '/tmp/matrix2.cool'
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('a', 30, 40, 1), ('b', 40, 50, 1)]

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, cut_intervals)
    # make matrix symmetric
    hic.matrix = hm.hiCMatrix.fillLowerTriangle(hic.matrix)

    hic.save(outfile)
    matrix_cool = hm.hiCMatrix(outfile)

    log.info('original data: {}'.format(hic.matrix))
    log.info('cool data: {}'.format(matrix_cool.matrix))

    saved, loaded = hic.matrix, matrix_cool.matrix
    nt.assert_equal(saved.data, loaded.data)
    nt.assert_equal(saved.indices, loaded.indices)
    nt.assert_equal(saved.indptr, loaded.indptr)

    # nan_bins and correction_factor are not supported by cool-format
    nt.assert_equal(hic.cut_intervals, matrix_cool.cut_intervals)

    unlink(outfile)
def test_save_load():
    """Round-trip a matrix through an ``.h5`` file.

    The matrix contains one NaN entry; correction factors and nan_bins are set
    before saving. After reloading, the correction factors, CSR components,
    nan_bins and cut intervals must equal those of the saved object.
    """
    outfile = '/tmp/matrix.h5'
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('a', 30, 40, 1), ('b', 40, 50, 1)]

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, np.nan, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, cut_intervals)
    # make matrix symmetric
    hic.matrix = hm.hiCMatrix.fillLowerTriangle(hic.matrix)

    hic.correction_factors = np.array([0.5, 1, 2, 3, 4])
    hic.nan_bins = np.array([4])

    hic.save(outfile)
    h5 = hm.hiCMatrix(outfile)

    nt.assert_equal(hic.correction_factors, h5.correction_factors)
    for attribute in ('data', 'indices', 'indptr'):
        nt.assert_equal(getattr(hic.matrix, attribute),
                        getattr(h5.matrix, attribute))
    nt.assert_equal(hic.nan_bins, h5.nan_bins)
    assert hic.cut_intervals == h5.cut_intervals

    unlink(outfile)
def test_build_matrix_rf():
    """Run ``hicBuildMatrix`` with a restriction-site file and compare with references.

    The produced matrix data and cut intervals are compared with
    ``small_test_rf_matrix.h5``; the QC log and the set of files in the QC
    folder are compared with the ``QC_rc`` reference folder.
    """
    outfile = NamedTemporaryFile(suffix='.h5', delete=False)
    outfile.close()
    qc_folder = mkdtemp(prefix="testQC_")

    command_line = ("-s {} {} -rs {} --outFileName {} --QCfolder {} "
                    "--restrictionSequence GATC "
                    "--danglingSequence GATC "
                    "--minDistance 150 "
                    "--maxLibraryInsertSize 1500 --threads 4")
    args = command_line.format(sam_R1, sam_R2, dpnii_file,
                               outfile.name, qc_folder).split()
    hicBuildMatrix.main(args)

    test = hm.hiCMatrix(ROOT + "small_test_rf_matrix.h5")
    new = hm.hiCMatrix(outfile.name)
    nt.assert_equal(test.matrix.data, new.matrix.data)
    nt.assert_equal(test.cut_intervals, new.cut_intervals)

    reference_qc = ROOT + "QC_rc/"
    print(set(os.listdir(reference_qc)))
    assert are_files_equal(ROOT + "QC_rc/QC.log", qc_folder + "/QC.log")
    assert set(os.listdir(reference_qc)) == set(os.listdir(qc_folder))

    # clean up temporary output
    os.unlink(outfile.name)
    shutil.rmtree(qc_folder)
def main(args=None):
    """Sum all matrices given in ``args.matrices`` and save the result.

    The first matrix is the reference. Every further matrix must have the same
    ``chrBinBoundaries``; otherwise an error is logged and the program exits
    with status 1. The nan_bins of all matrices are collected and masked in
    the summed matrix before it is written to ``args.outFileName``.

    :param args: optional argument list handed to the argument parser
    """
    args = parse_arguments().parse_args(args)

    first_file = args.matrices[0]
    hic = hm.hiCMatrix(first_file)
    summed_matrix = hic.matrix
    nan_bins = set(hic.nan_bins)

    for matrix in args.matrices[1:]:
        hic_to_append = hm.hiCMatrix(matrix)

        if hic.chrBinBoundaries != hic_to_append.chrBinBoundaries:
            log.error(
                "The two matrices have different chromosome order. Use the tool `hicExport` to change the order.\n"
                "{}: {}\n"
                "{}: {}".format(first_file,
                                list(hic.chrBinBoundaries),
                                matrix,
                                list(hic_to_append.chrBinBoundaries)))
            exit(1)

        try:
            summed_matrix = summed_matrix + hic_to_append.matrix
            if len(hic_to_append.nan_bins):
                nan_bins = nan_bins.union(hic_to_append.nan_bins)
        except Exception:
            log.exception(
                "\nMatrix {} seems to be corrupted or of different shape".format(matrix))
            exit(1)

    # save only the upper triangle of the
    # symmetric matrix
    hic.setMatrixValues(summed_matrix)
    hic.maskBins(sorted(nan_bins))
    hic.save(args.outFileName)
def test_build_matrix_cooler():
    """Run ``hicBuildMatrix`` with ``.cool`` output and compare with references.

    Matrix data must equal ``small_test_matrix_parallel.h5``. Cut intervals are
    compared only on their first three fields. The QC log and the set of
    files in the QC folder are compared with the ``QC`` reference folder.
    """
    outfile = NamedTemporaryFile(suffix='.cool', delete=False)
    outfile.close()
    qc_folder = mkdtemp(prefix="testQC_")

    command_line = "-s {} {} --outFileName {} -bs 5000 -b /tmp/test.bam --QCfolder {} --threads 4"
    args = command_line.format(sam_R1, sam_R2, outfile.name, qc_folder).split()
    hicBuildMatrix.main(args)

    test = hm.hiCMatrix(ROOT + "small_test_matrix_parallel.h5")
    new = hm.hiCMatrix(outfile.name)

    nt.assert_equal(test.matrix.data, new.matrix.data)
    nt.assert_equal(len(new.cut_intervals), len(test.cut_intervals))

    # only the first three fields of each interval are compared
    cut_interval_new_ = [interval[:3] for interval in new.cut_intervals]
    cut_interval_test_ = [interval[:3] for interval in test.cut_intervals]
    nt.assert_equal(cut_interval_new_, cut_interval_test_)

    assert are_files_equal(ROOT + "QC/QC.log", qc_folder + "/QC.log")
    assert set(os.listdir(ROOT + "QC/")) == set(os.listdir(qc_folder))

    # clean up temporary output
    os.unlink(outfile.name)
    shutil.rmtree(qc_folder)
def run_compare(pInputFile, pInputFormat, pOutputFormat, pChrNameList=None):
    """Export a matrix with ``hicExport`` and compare the output with the input.

    The matrix data and the first three fields of every cut interval of the
    exported file must equal those of the input file.

    :param pInputFile: path of the matrix to export
    :param pInputFormat: value passed to ``--inputFormat``
    :param pOutputFormat: value passed to ``--outputFormat``; also used as
                          suffix of the temporary output file
    :param pChrNameList: accepted but not used by this function
    :return: True if no assertion failed
    """
    outfile = NamedTemporaryFile(suffix='.' + pOutputFormat, delete=False)
    outfile.close()

    command_line = ("--inFile {} --inputFormat {} "
                    "--outFileName {} "
                    "--outputFormat {}")
    args = command_line.format(pInputFile, pInputFormat,
                               outfile.name, pOutputFormat).split()
    hicExport.main(args)

    test = hm.hiCMatrix(pInputFile)
    new = hm.hiCMatrix(outfile.name)

    nt.assert_equal(new.matrix.data, test.matrix.data)
    nt.assert_equal(len(new.cut_intervals), len(test.cut_intervals))

    cut_interval_new_ = [interval[:3] for interval in new.cut_intervals]
    cut_interval_test_ = [interval[:3] for interval in test.cut_intervals]
    nt.assert_equal(cut_interval_new_, cut_interval_test_)

    os.unlink(outfile.name)
    return True
def test_save_bing_ren():
    """
    Save a small matrix with ``save_bing_ren`` and try to load it again.

    Test needs to be marked as xfail because .gz files are expected in
    ``__init__`` to be in dekker file format.
    """
    outfile = '/tmp/matrix.gz'
    # make sure the output path exists (open for reading, else create it)
    try:
        handle = open(outfile, 'r')
    except Exception:
        handle = open(outfile, 'w')
    handle.close()

    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('a', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, cut_intervals)
    # make matrix symmetric
    hic.matrix = hic.fillLowerTriangle(hic.matrix)

    hic.save_bing_ren(outfile)

    # Test fails here due to __init__ of hiCMatrix
    hm.hiCMatrix(outfile)
def test_save_dekker():
    """Save a small matrix in dekker format and compare it after reloading.

    The shape and the content returned by ``getMatrix()`` must be the same
    for the saved and the reloaded object.
    """
    outfile = '/tmp/matrix.gz'
    # make sure the output path exists (open for reading, else create it)
    try:
        handle = open(outfile, 'r')
    except Exception:
        handle = open(outfile, 'w')
    handle.close()

    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('a', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, cut_intervals)
    # make matrix symmetric
    hic.matrix = hm.hiCMatrix.fillLowerTriangle(hic.matrix)

    hic.save_dekker(outfile)

    dekker_test = hm.hiCMatrix(outfile)
    # NOTE(review): the return value is discarded here, while other tests
    # assign it back to ``.matrix`` -- confirm whether this call is needed.
    dekker_test.fillLowerTriangle(dekker_test.matrix)

    original = hic.getMatrix()
    reloaded = dekker_test.getMatrix()
    nt.assert_equal(original.shape, reloaded.shape)
    nt.assert_equal(hic.getMatrix(), dekker_test.getMatrix())
def main():
    """Read a Hi-C matrix in the requested input format and save it in another.

    Steps, in order:
      1. build the matrix object (lieberman needs ``--chrNameList``; several
         npz files are combined with ``combine_matrices``; anything else is
         loaded from the first input file),
      2. optionally restrict the matrix depth (``--bplimit``), reorder/keep
         chromosomes (``--chromosomeOrder``) and mask bins
         (``--clearMaskedBins``),
      3. save with the method matching ``--outputFormat``; unknown formats
         fall back to the generic ``save``.
    """
    args = parse_arguments().parse_args()

    # create hiC matrix with given input format
    if args.inputFormat == 'lieberman':
        # additional file needed for lieberman format
        if args.chrNameList is None:
            exit("Error: --chrNameList is required when the input format is lieberman. ")
        hic_ma = hm.hiCMatrix(matrixFile=args.inFile,
                              file_format='lieberman',
                              chrnameList=args.chrNameList)

    elif args.inputFormat == 'npz' and len(args.inFile) > 1:
        # assume hicexplorer_multi format
        if args.bplimit:
            sys.stderr.write("\nCutting maximum matrix depth to {} for saving\n".format(args.bplimit))

        combined = combine_matrices(args.inFile, bplimit=args.bplimit)
        matrix, cut_intervals, nan_bins, corrections_factors, distance_counts = combined

        hic_ma = hm.hiCMatrix()
        hic_ma.setMatrix(matrix, cut_intervals=cut_intervals)

        # only overwrite the optional attributes when data is available
        if len(nan_bins):
            hic_ma.nan_bins = nan_bins
        if corrections_factors is not None:
            hic_ma.correction_factors = corrections_factors
        if distance_counts is not None:
            hic_ma.distance_counts = distance_counts

    else:
        hic_ma = hm.hiCMatrix(matrixFile=args.inFile[0], file_format=args.inputFormat)
        if args.bplimit:
            from scipy.sparse import triu
            sys.stderr.write("\nCutting maximum matrix depth to {} for saving\n".format(args.bplimit))

            limit = int(args.bplimit / hic_ma.getBinSize())
            # difference of the two triangular parts, starting at diagonals -limit and limit
            lower_part = triu(hic_ma.matrix, k=-limit)
            upper_part = triu(hic_ma.matrix, k=limit)
            hic_ma.matrix = (lower_part - upper_part).tocsr()
            hic_ma.matrix.eliminate_zeros()

    if args.chromosomeOrder:
        hic_ma.keepOnlyTheseChr(args.chromosomeOrder)

    if args.clearMaskedBins:
        hic_ma.maskBins(hic_ma.nan_bins)

    sys.stderr.write('saving...\n')

    # output format -> name of the saving method; looked up lazily so that
    # only the selected method is accessed
    save_method_names = {
        'dekker': 'save_dekker',
        'ren': 'save_bing_ren',
        'lieberman': 'save_lieberman',
        'npz': 'save_npz',
        'GInteractions': 'save_GInteractions',
    }
    method_name = save_method_names.get(args.outputFormat, 'save')
    getattr(hic_ma, method_name)(args.outFileName)
def main(args=None):
    """Compute the log2 ratio of a treatment and a control matrix and save it.

    The matrix with the larger total is scaled down to the total of the
    smaller one. The log2 ratio is then evaluated at every position stored
    in the sum of the two matrices, so that a zero present in only one
    matrix is kept while positions that are zero in both are not stored.
    The result replaces the treatment matrix and is written to
    ``args.outFileName``.

    :param args: optional argument list handed to the argument parser
    """
    args = parse_arguments().parse_args(args)

    hic_t = hm.hiCMatrix(matrixFile=args.treatment)
    hic_c = hm.hiCMatrix(matrixFile=args.control)

    # scale larger matrix down
    total_t = hic_t.matrix.sum()
    total_c = hic_c.matrix.sum()
    if total_c > total_t:
        scale_t, scale_c = 1, float(total_t) / total_c
    else:
        scale_t, scale_c = float(total_c) / total_t, 1

    hic_t.matrix.data = hic_t.matrix.data * scale_t
    hic_c.matrix.data = hic_c.matrix.data * scale_c

    # Both inputs have to be sparse matrices of the same shape that differ
    # in at least their overall difference.
    assert hic_t.matrix.shape == hic_c.matrix.shape, log.error(
        "Matrices have different shapes.")

    assert (hic_t.matrix - hic_c.matrix).sum() != 0, log.error("Matrices are identical.")

    # Sum of both matrices: it stores every position that is stored in
    # either of the two matrices.
    _mat = hic_t.matrix + hic_c.matrix

    # shift all stored values by one
    _mat.data += 1

    # The two subtractions and the trailing "- 1" are meant to yield one value
    # per stored position of _mat (original comments: "values in hic_t" /
    # "values in hic_c").
    # NOTE(review): relies on both differences keeping the sparsity pattern
    # of _mat -- confirm.
    values_t = (_mat - hic_t.matrix).data - 1
    values_c = (_mat - hic_c.matrix).data - 1

    # compute log2ratio
    _mat.data = np.log2(values_t / values_c)

    hic_t.matrix = _mat
    hic_t.matrix.eliminate_zeros()
    hic_t.save(args.outFileName)
def test_hic_transfer_covariance():
    """Run ``hicTransform --method covariance`` and compare with the reference matrix."""
    outfile = NamedTemporaryFile(suffix='covariance_.h5', delete=False)
    outfile.close()

    matrix = ROOT + "/hicTransform/pearson_small_50kb.h5"
    command_line = "--matrix {} --outFileName {} --method covariance"
    args = command_line.format(matrix, outfile.name).split()
    hicTransform.main(args)

    expected = hm.hiCMatrix(ROOT + "hicTransform/covariance_small_50kb.h5")
    produced = hm.hiCMatrix(outfile.name)
    nt.assert_array_almost_equal(expected.matrix.data, produced.matrix.data)

    os.unlink(outfile.name)
def test_hic_transfer_obs_exp():
    """Run ``hicTransform --method obs_exp`` and compare with the reference matrix."""
    outfile = NamedTemporaryFile(suffix='obs_exp_.h5', delete=False)
    outfile.close()

    command_line = "--matrix {} --outFileName {} --method obs_exp"
    args = command_line.format(original_matrix, outfile.name).split()
    hicTransform.main(args)

    expected = hm.hiCMatrix(ROOT + "hicTransform/obs_exp_small_50kb.h5")
    produced = hm.hiCMatrix(outfile.name)
    nt.assert_array_almost_equal(expected.matrix.data, produced.matrix.data)

    os.unlink(outfile.name)
def test_save_hdf5():
    """
    Round-trip a small matrix through ``save_hdf5`` in two configurations:

      1. the lower triangle is filled before saving,
      2. the matrix is saved as built (upper triangle only); the lower
         triangle is filled on both objects after reloading.

    NOTE(review): the original description refers to the ``pSymmetric``
    option of ``save_hdf5``, but both calls below use the default
    arguments -- confirm whether the second one should pass
    ``pSymmetric=False``.
    """
    outfile = '/tmp/matrix.h5'

    # --- configuration 1: symmetric matrix saved ---
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('a', 30, 40, 1), ('b', 40, 50, 1)]
    hic.nan_bins = []
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, cut_intervals)
    hic.matrix = hic.fillLowerTriangle(hic.matrix)

    hic.save_hdf5(outfile)
    hdf5_test = hm.hiCMatrix(outfile)
    nt.assert_equal(hic.getMatrix(), hdf5_test.getMatrix())

    # --- configuration 2 (originally labelled "Test pSymmetric=False") ---
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('a', 30, 40, 1), ('b', 40, 50, 1)]
    hic.nan_bins = []
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, cut_intervals)

    hic.save_hdf5(outfile)
    hdf5_test_pSym_False = hm.hiCMatrix(outfile)

    # fill the lower triangle on both sides before comparing
    hdf5_test_pSym_False.matrix = hdf5_test_pSym_False.fillLowerTriangle(
        hdf5_test_pSym_False.matrix)
    hic.matrix = hic.fillLowerTriangle(hic.matrix)

    nt.assert_equal(hic.getMatrix(), hdf5_test_pSym_False.getMatrix())
def test_correct_matrix():
    """Run ``hicMergeMatrixBins`` with ``--numBins 5`` and compare with the reference.

    Matrix data and cut intervals of the produced file must equal those of
    ``hicMergeMatrixBins/result.h5``.
    """
    outfile = NamedTemporaryFile(suffix='.h5', delete=False)
    outfile.close()

    command_line = ("--matrix {} --numBins 5 "
                    " --outFileName {}")
    args = command_line.format(ROOT + "small_test_matrix.h5", outfile.name).split()
    hicMergeMatrixBins.main(args)

    expected = hm.hiCMatrix(ROOT + "hicMergeMatrixBins/result.h5")
    produced = hm.hiCMatrix(outfile.name)
    nt.assert_equal(expected.matrix.data, produced.matrix.data)
    nt.assert_equal(expected.cut_intervals, produced.cut_intervals)

    os.unlink(outfile.name)
def test_save():
    """
    Save one matrix via the generic ``save`` as .h5, .cool, .npz and .gz
    (dekker) and check that every reloaded matrix equals the original.

    The formats ren, lieberman and GInteractions are not covered because
    they are unsupported file_formats in ``__init__`` of class hiCMatrix;
    see the single tests for these formats (marked as xfail).
    """
    matrix_h5 = '/tmp/matrix.h5'
    matrix_cool = '/tmp/matrix.cool'
    matrix_npz = '/tmp/matrix.npz'
    matrix_gz = '/tmp/matrix.gz'

    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('a', 30, 40, 1), ('b', 40, 50, 1)]
    hic.nan_bins = []
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, cut_intervals)
    hic.matrix = hic.fillLowerTriangle(hic.matrix)

    # save and reload in this order: h5, cool, npz, dekker (.gz)
    reloaded = []
    for path in (matrix_h5, matrix_cool, matrix_npz, matrix_gz):
        hic.save(path)
        reloaded.append(hm.hiCMatrix(path))

    for loaded in reloaded:
        nt.assert_equal(hic.getMatrix(), loaded.getMatrix())
def main():
    """Merge adjacent bins of a Hi-C matrix and save the merged matrix.

    With ``--runningWindow`` the merge is done by ``running_window_merge_v2``,
    otherwise by ``merge_bins``; both receive the loaded matrix and
    ``args.numBins``. Explicit zeros are removed from the merged matrix.
    Correction factors are not merged: if present, a warning is written to
    stderr and they are set to None before saving to ``args.outFileName``.
    """
    args = parse_arguments().parse_args()

    hic = hm.hiCMatrix(args.matrix)
    if args.runningWindow:
        merged_matrix = running_window_merge_v2(hic, args.numBins)
    else:
        merged_matrix = merge_bins(hic, args.numBins)

    # print() with a single argument behaves the same on Python 2 and 3
    print('saving matrix')

    # drop explicitly stored zeros to keep the saved matrix sparse
    merged_matrix.matrix.eliminate_zeros()

    if merged_matrix.correction_factors is not None:
        sys.stderr.write(
            "*WARNING*: The corrections factors are not merged and are set to None\n"
        )
        merged_matrix.correction_factors = None

    merged_matrix.save(args.outFileName)
def test_convert_to_zscore_matrix():
    """Compare ``convert_to_zscore_matrix`` with a z-score computed per diagonal.

    A random upper-triangular integer matrix is generated and all values
    below 90 are set to zero. For every diagonal the mean and standard
    deviation are computed, and the expected z-score of each upper-triangle
    entry is ``(value - mean) / std`` of its diagonal.
    """
    from numpy.testing import assert_almost_equal

    # make test matrix; randint's upper bound is exclusive, so 101 keeps
    # the value range 0..100 inclusive
    m_size = 100
    mat = np.triu(np.random.randint(0, 101, (m_size, m_size)))
    # add a number of zeros
    mat[mat < 90] = 0

    n_bins = mat.shape[0]
    mu = {idx: mat.diagonal(idx).mean() for idx in range(n_bins)}
    std = {idx: np.std(mat.diagonal(idx)) for idx in range(n_bins)}

    # compute z-score for test matrix (upper triangle including the diagonal)
    zscore_mat = np.zeros((m_size, m_size))
    for _i in range(n_bins):
        for _j in range(_i, n_bins):
            diag = _j - _i
            zscore_mat[_i, _j] = (mat[_i, _j] - mu[diag]) / std[diag]

    # make Hi-C matrix based on test matrix
    hic = hm.hiCMatrix()
    hic.matrix = csr_matrix(mat)
    cut_intervals = [('chr', idx, idx + 10, 0)
                     for idx in range(0, n_bins * 10, 10)]
    hic.setMatrix(hic.matrix, cut_intervals)
    hic.convert_to_zscore_matrix()

    assert_almost_equal(hic.matrix.todense(), zscore_mat)
def test_reorderMatrix():
    """Check ``reorderMatrix((1, 3), 2)`` against a fixed expected result.

    After the call, the matrix content, its shape and the cut intervals must
    equal the expected values defined below.
    """
    orig = (1, 3)
    dest = 2

    # get matrix
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('b', 30, 40, 1), ('b', 40, 50, 1)]
    hic.nan_bins = []
    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)
    nt.assert_equal(hic.getMatrix(), matrix)

    # expected state after reordering
    new_matrix = np.matrix([[1, 3, 8, 5, 0],
                            [0, 0, 0, 0, 1],
                            [0, 5, 4, 15, 1],
                            [0, 0, 0, 0, 2],
                            [0, 0, 0, 0, 0]])
    new_cut_intervals = [('a', 0, 10, 1), ('b', 30, 40, 1), ('a', 10, 20, 1),
                         ('a', 20, 30, 1), ('b', 40, 50, 1)]

    # reorder matrix
    hic.reorderMatrix(orig, dest)

    # check if it is equal
    nt.assert_equal(hic.getMatrix(), new_matrix)
    nt.assert_equal(hic.matrix.shape, new_matrix.shape)
    nt.assert_equal(hic.cut_intervals, new_cut_intervals)
def test_removeBySequencedCount():
    """Check that ``removeBySequencedCount`` returns None for non-float64 coverage.

    The last field of every cut interval is a plain integer here; in that
    case the function is expected to return directly.
    """
    # get matrix
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('b', 30, 40, 1), ('b', 40, 50, 1)]
    hic.nan_bins = []
    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)
    nt.assert_equal(hic.getMatrix(), matrix)

    # function returns directly if last entry of cut_intervals not float64
    coverage = list(zip(*hic.cut_intervals))[3]
    assert type(coverage[0]) is not np.float64

    # define expected outcome and test it
    to_remove_expected = None
    to_remove = hic.removeBySequencedCount()
    nt.assert_equal(to_remove, to_remove_expected)
def main():
    """Print summary information for every matrix file in ``args.matrices``.

    For each file the following is printed: matrix size, half of the sum of
    all elements, bin length, chromosome names, number of stored non-zero
    elements, minimum and maximum of the stored values and the number of
    nan bins.
    """
    args = parse_arguments().parse_args()

    for matrix in args.matrices:
        print("File:\t{}".format(matrix))

        hic_ma = hm.hiCMatrix(matrix)
        contacts = hic_ma.matrix

        # collect all values first, then print them in a fixed order
        size = contacts.shape[0]
        num_non_zero = contacts.nnz
        sum_elements = contacts.sum() / 2
        bin_length = hic_ma.getBinSize()
        num_nan_bins = len(hic_ma.nan_bins)
        min_non_zero = contacts.data.min()
        max_non_zero = contacts.data.max()
        chromosomes = hic_ma.chrBinBoundaries.keys()

        report = (
            "Size:\t{:,}".format(size),
            "Sum:\t{:,}".format(sum_elements),
            "Bin_length:\t{}".format(bin_length),
            "Chromosomes:\t{}".format(", ".join(chromosomes)),
            "Non-zero elements:\t{:,}".format(num_non_zero),
            "Minimum (non zero):\t{}".format(min_non_zero),
            "Maximum:\t{}".format(max_non_zero),
            "NaN bins:\t{}".format(num_nan_bins),
            "",
        )
        for line in report:
            print(line)
def test_removeBins():
    """Check ``removeBins([1, 2])`` against a fixed expected result.

    After the call, the matrix content and shape, ``chrBinBoundaries`` and
    ``cut_intervals`` must equal the expected values for the three
    remaining bins.
    """
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('b', 30, 40, 1), ('b', 40, 50, 1)]
    hic.nan_bins = []
    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)
    nt.assert_equal(hic.getMatrix(), matrix)

    # expected state after removing bins 1 and 2
    new_matrix = np.matrix([[1, 3, 0],
                            [0, 0, 1],
                            [0, 0, 0]])
    expected_boundaries = OrderedDict([('a', (0, 1)), ('b', (1, 3))])
    expected_intervals = [('a', 0, 10, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]

    ids2remove = [1, 2]
    hic.removeBins(ids2remove)

    nt.assert_equal(hic.getMatrix(), new_matrix)
    nt.assert_equal(hic.matrix.shape, new_matrix.shape)
    nt.assert_equal(hic.chrBinBoundaries, expected_boundaries)
    nt.assert_equal(hic.cut_intervals, expected_intervals)
def test_reorderChromosomes_old():
    """Check ``reorderChromosomes_old`` for three chromosome orders.

    The order is first swapped, then restored, and finally an order naming
    an extra chromosome is passed, after which ``chrBinBoundaries`` is
    expected to be unchanged.
    """
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('b', 30, 40, 1), ('b', 40, 50, 1)]
    hic.nan_bins = []
    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)

    original_boundaries = [('a', (0, 3)), ('b', (3, 5))]
    swapped_boundaries = [('b', (0, 2)), ('a', (2, 5))]

    # (requested order, boundaries expected afterwards)
    scenarios = (
        (['b', 'a'], swapped_boundaries),
        (['a', 'b'], original_boundaries),
        # new order too long will cause function to return
        (['a', 'b', 'c'], original_boundaries),
    )
    for chr_order, boundaries in scenarios:
        hic.reorderChromosomes_old(chr_order)
        nt.assert_equal(hic.chrBinBoundaries, OrderedDict(boundaries))
def test_get_chromosome_sizes():
    """Check ``get_chromosome_sizes`` for two sets of cut intervals.

    In both cases the expected size of a chromosome equals the end position
    of its last interval.
    """
    # get matrix
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('b', 30, 40, 1), ('b', 40, 50, 1)]
    hic.nan_bins = []
    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)
    nt.assert_equal(hic.getMatrix(), matrix)

    # first set of intervals
    expected_sizes = OrderedDict([('a', 30), ('b', 50)])
    nt.assert_equal(hic.get_chromosome_sizes(), expected_sizes)

    # define new intervals and test again
    new_cut_intervals = [('a', 0, 10, 1), ('b', 10, 20, 1), ('b', 20, 30, 1),
                         ('c', 30, 40, 1), ('c', 40, 90, 1)]
    expected_sizes = OrderedDict([('a', 10), ('b', 30), ('c', 90)])

    hic.setMatrix(hic.matrix, new_cut_intervals)
    nt.assert_equal(hic.get_chromosome_sizes(), expected_sizes)
def chr_diagonals(matrix_file_name, chromosome_exclude):
    """
    Extract the counts by distance per chromosome.

    The matrix is loaded, unwanted chromosomes are filtered, and only the
    chromosomes not listed in ``chromosome_exclude`` are kept before
    ``getCountsByDistance(per_chr=True)`` is called.

    :param matrix_file_name: path of the Hi-C matrix file
    :param chromosome_exclude: chromosome names to drop, or None for none
    :return: tuple ``(diagonal_dict, chrom_list, common_dist, max_dist)``:
             the per-chromosome counts, the chromosome names, the list of
             distances present for every chromosome that has any distance
             (empty if no chromosome has one), and the largest distance seen
    """
    hic_ma = HiCMatrix.hiCMatrix(matrix_file_name)
    print("removing unwanted chrs")
    hic_ma.filterUnwantedChr()

    if chromosome_exclude is None:
        chromosome_exclude = []
    chrtokeep = [x for x in hic_ma.interval_trees.keys()
                 if x not in chromosome_exclude]

    print("Number of contacts {}".format(hic_ma.matrix.sum()))
    hic_ma.keepOnlyTheseChr(chrtokeep)

    diagonal_dict = hic_ma.getCountsByDistance(per_chr=True)
    common_dist = None
    max_dist = 0
    chrom_list = hic_ma.getChrNames()

    for chrom in chrom_list:
        try:
            # materialise the keys: a Python 3 key view cannot be indexed
            distances = list(diagonal_dict[chrom].keys())
        except KeyError:
            # chromosome without counts
            continue
        if not distances:
            continue

        largest = max(distances)
        if largest > max_dist:
            max_dist = largest

        # get list of common distances
        if common_dist is None:
            common_dist = set(distances)
        else:
            common_dist = common_dist.intersection(distances)

    # no chromosome contributed any distance -> empty list instead of list(None)
    common_dist_list = [] if common_dist is None else list(common_dist)
    return diagonal_dict, chrom_list, common_dist_list, max_dist
def test_update_matrix(capsys):
    """Check the error handling of ``update_matrix``.

    * a 3x3 update is attempted first; an AttributeError is tolerated,
    * an AssertionError is expected when the number of cut intervals differs
      from the matrix size,
    * an exception is expected when the matrix contains masked bins.
    """
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('b', 30, 40, 1), ('b', 40, 50, 1)]
    hic.nan_bins = []
    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)
    nt.assert_equal(hic.getMatrix(), matrix)

    new_cut_intervals = [('c', 0, 10, 1), ('d', 10, 20, 1), ('d', 20, 30, 1)]
    new_matrix = np.array([[3, 6, 4],
                           [np.nan, 0, 2],
                           [1, 0, 0]])

    # an AttributeError raised by the plain update is ignored
    try:
        hic.update_matrix(new_matrix, new_cut_intervals)
    except AttributeError:
        pass

    # if matrix.shape[0] not equal to length of cut_intervals assertionError is raised
    short_cut_intervals = new_cut_intervals[:2]
    with pytest.raises(AssertionError):
        hic.update_matrix(new_matrix, short_cut_intervals)

    # if matrix contains masked bins exception is raised
    hic.maskBins([0, 1])
    with pytest.raises(Exception):
        hic.update_matrix(new_matrix, new_cut_intervals)
def main(args=None):
    """Print the summed contacts inside TADs, the "inter" sum and their ratio.

    Every region of the bed file ``args.tads`` is converted to a bin range of
    the matrix ``args.matrix``. The contacts inside each region are added to
    the TAD sum. For every region that is not the first one seen on its
    chromosome, the contacts between the bins from ``prev_start`` up to the
    region start and the bins of the region are added to the "inter" sum.

    :param args: optional argument list handed to the argument parser
    """
    args = parse_arguments().parse_args(args)
    hicma = hm.hiCMatrix(args.matrix)
    bed_h = readBed.ReadBed(args.tads)

    prev_chrom = None
    prev_start = None
    sum_tads = 0
    sum_inter = 0

    for bed in bed_h:
        start_bin, end_bin = hicma.getRegionBinRange(bed.chromosome,
                                                     bed.start, bed.end)
        sum_tads += hicma.matrix[start_bin:end_bin, start_bin:end_bin].sum()

        if prev_chrom is None or bed.chromosome != prev_chrom:
            # first region of a chromosome: only remember where it starts
            prev_chrom = bed.chromosome
            prev_start = start_bin
            continue

        # NOTE(review): prev_start is only set for the first region of a
        # chromosome and never advanced afterwards -- confirm this is intended.
        sum_inter += hicma.matrix[prev_start:start_bin,
                                  start_bin:end_bin].sum()

    # float() makes the ratio a true division on Python 2 as well
    print("sum tads\t{}\nsum inter\t{}\nratio inter/tads\t{:.3f}".format(
        sum_tads, sum_inter, float(sum_inter) / sum_tads))
def test_filterUnwantedChr():
    """Check ``filterUnwantedChr`` with and without the ``chromosome`` argument.

    Without an argument the three ``*Het`` chromosomes checked below must be
    gone afterwards. With ``chromosome='chrX'`` only 'chrX' may remain.
    """
    hic = hm.hiCMatrix(ROOT + 'small_test_matrix.h5')
    unwanted = ('chr2RHet', 'chr3LHet', 'chr3RHet')

    # all of them are present before filtering ...
    for name in unwanted:
        assert name in hic.getChrNames()

    hic.filterUnwantedChr()

    # ... and none of them afterwards
    for name in unwanted:
        assert name not in hic.getChrNames()

    # make sure there are any other chromosomes than 'chrX'
    chromosomes = list(hic.getChrNames())
    assert any(name != 'chrX' for name in chromosomes)

    # then filter for 'chrX'
    hic.filterUnwantedChr(chromosome='chrX')

    # and check that there are only 'chrX'-chromosomes left in matrix
    chromosomes = list(hic.getChrNames())
    assert all(name == 'chrX' for name in chromosomes)
def test_keepOnlyTheseChr():
    """Check that ``keepOnlyTheseChr`` leaves exactly the requested chromosomes.

    The chromosome names are compared after sorting, so their order in the
    matrix does not matter.
    """
    chromosome_list = ['chrX', 'chr2RHet']
    hic = hm.hiCMatrix(ROOT + 'small_test_matrix.h5')

    hic.keepOnlyTheseChr(chromosome_list)

    # sorted() returns the sorted lists; list.sort() returns None, which
    # would turn the comparison into ``None == None``
    nt.assert_equal(sorted(hic.getChrNames()), sorted(chromosome_list))