def test_save_cool_apply_division():
    """Save a cool matrix with the '/' correction operator and compare it to its source.

    The reference file is loaded, written to a temporary cool file with the
    correction applied, read back, and compared against a fresh load of the
    reference file.
    """
    saved_path = outfile + '.cool'
    reference_path = ROOT + 'Li_et_al_2015.cool'

    # Load the reference data that is going to be written out.
    source_handler = MatrixFileHandler(pFileType='cool',
                                       pMatrixFile=reference_path,
                                       pCorrectionOperator='/')
    assert source_handler is not None
    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = source_handler.load()

    # Pass the data to a new handler and store it with the correction applied.
    writer = MatrixFileHandler(pFileType='cool', pCorrectionOperator='/')
    writer.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                correction_factors, distance_counts)
    writer.save(pName=saved_path, pSymmetric=False, pApplyCorrection=True)

    # Read the freshly written file back in.
    saved_handler = MatrixFileHandler(pFileType='cool',
                                      pMatrixFile=saved_path,
                                      pCorrectionOperator='/')
    assert saved_handler is not None
    (matrix_test, cut_intervals_test, nan_bins_test,
     distance_counts_test, correction_factors_test) = saved_handler.load()

    # The reference is loaded a second time before comparing.
    # NOTE(review): presumably so the comparison does not rely on the objects
    # that were handed to save() -- confirm before removing.
    reference_path = ROOT + 'Li_et_al_2015.cool'
    source_handler = MatrixFileHandler(pFileType='cool',
                                       pMatrixFile=reference_path,
                                       pCorrectionOperator='/')
    assert source_handler is not None
    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = source_handler.load()

    nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=1)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)

    os.unlink(saved_path)
def test_load_cool_matrix_only():
    """Check that a matrix-only load agrees with the matrix of a regular load.

    With ``pLoadMatrixOnly=True`` the test expects a four-element matrix
    description and ``None`` for every other returned value.
    """
    cool_path = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'

    raw_handler = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_path,
                                    pLoadMatrixOnly=True)
    (raw_matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = raw_handler.load()

    assert len(raw_matrix) == 4
    assert cut_intervals is None
    assert nan_bins is None
    assert distance_counts is None
    assert correction_factors is None

    full_handler = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_path)
    (full_matrix, cut_intervals2, nan_bins2,
     distance_counts2, correction_factors2) = full_handler.load()

    # Elements 0 and 1 are compared with the row/column indices of the fully
    # loaded matrix, element 2 with its values, element 3 with its dimension.
    rows, cols = full_matrix.nonzero()
    nt.assert_almost_equal(raw_matrix[0], rows, decimal=1)
    nt.assert_almost_equal(raw_matrix[1], cols, decimal=1)
    nt.assert_almost_equal(raw_matrix[2], full_matrix.data, decimal=1)
    assert raw_matrix[3] == full_matrix.shape[0]
def test_load_distance_cool():
    """Check the ``pDistance`` restriction on load and a save/load round trip.

    Chromosome '1' is loaded with ``pDistance=2500000``, saved, and reloaded.
    The restricted matrix must not hold entries more than one bin off the
    diagonal, whereas an unrestricted load of the same chromosome must.
    """
    saved_path = outfile + '.cool'
    cool_path = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'

    # Distance-restricted load of chromosome 1.
    handler = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_path,
                                pChrnameList=['1'], pDistance=2500000)
    assert handler is not None
    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = handler.load()

    # Store the restricted data through the same handler.
    handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                 correction_factors, distance_counts)
    handler.save(pName=saved_path, pSymmetric=True, pApplyCorrection=True)

    saved_handler = MatrixFileHandler(pFileType='cool', pMatrixFile=saved_path)
    assert saved_handler is not None
    (matrix_test, cut_intervals_test, nan_bins_test,
     distance_counts_test, correction_factors_test) = saved_handler.load()

    # 2.5 mb resolution --> a 2.5 Mb limit leaves at most one bin of offset.
    rows, cols = matrix.nonzero()
    offsets = np.absolute(rows - cols)
    too_far = offsets > 1
    assert np.sum(too_far) == 0

    # Without the distance restriction, longer-range entries must be present.
    handler = MatrixFileHandler(pFileType='cool', pChrnameList=['1'],
                                pMatrixFile=cool_path)
    assert handler is not None
    matrix2, _, _, _, _ = handler.load()
    rows, cols = matrix2.nonzero()
    offsets = np.absolute(rows - cols)
    too_far = offsets > 1
    assert np.sum(too_far) > 0

    # The saved file has to reproduce what was loaded.
    nt.assert_equal(matrix.data, matrix_test.data)
    nt.assert_equal(cut_intervals, cut_intervals_test)
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)
    nt.assert_equal(correction_factors, correction_factors_test)

    os.unlink(saved_path)
def compute_consensus_matrix(pMatrixName, pClusterMatricesList, pClusterName, pQueue):
    """Sum all matrices of one cluster and put the resulting handler on ``pQueue``.

    Parameters
    ----------
    pMatrixName : str
        Path of the container file; every cell is addressed as
        ``pMatrixName + '::' + <cell name>``.
    pClusterMatricesList : list of str
        Names of the cells that belong to the cluster. The first one is loaded
        completely (its cut intervals, nan bins, correction factors and
        distance counts are reused for the output); the others are loaded
        matrix-only and added to it.
    pClusterName
        Cluster identifier; used in the output matrix name and in log output.
    pQueue
        Object with a ``put`` method that receives the result.

    Notes
    -----
    Exactly one item is always put on ``pQueue``: the fully configured output
    ``MatrixFileHandler`` on success, or ``None`` if anything failed. Before,
    a failure left the result variable unbound, so the final ``put`` raised and
    the consumer never received anything.
    """
    matrixFileHandlerOutput = None
    try:
        matrixFileHandlerInput = MatrixFileHandler(
            pFileType='cool',
            pMatrixFile=pMatrixName + '::' + pClusterMatricesList[0])
        consensus_matrix, cut_intervals, nan_bins, \
            distance_counts, correction_factors = matrixFileHandlerInput.load()

        for matrix in pClusterMatricesList[1:]:
            matrixFileHandlerInput = MatrixFileHandler(
                pFileType='cool',
                pMatrixFile=pMatrixName + '::' + matrix,
                pLoadMatrixOnly=True)
            _matrix, _, _, _, _ = matrixFileHandlerInput.load()

            # A matrix-only load is used here as (rows, columns, values, size).
            # ``float`` replaces ``np.float``, a plain alias of the builtin
            # that has been removed from NumPy.
            consensus_matrix += csr_matrix(
                (_matrix[2], (_matrix[0], _matrix[1])),
                (_matrix[3], _matrix[3]),
                dtype=float)

        hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
        handler = MatrixFileHandler(
            pFileType='cool',
            pMatrixFile='consensus_matrix_cluster_' + str(pClusterName) + ':' + str(len(pClusterMatricesList)),
            pEnforceInteger=False,
            pFileWasH5=False,
            pHic2CoolVersion=hic2CoolVersion)
        handler.set_matrix_variables(consensus_matrix, cut_intervals, nan_bins,
                                     correction_factors, distance_counts)
        # Publish the handler only once it is completely set up.
        matrixFileHandlerOutput = handler
    except Exception as exp:
        log.debug('exception! {}'.format(str(exp)))
    log.debug('computaiton of {} done'.format(str(pClusterName)))
    pQueue.put(matrixFileHandlerOutput)
def test_load_hicpro(capsys):
    """Load a HiC-Pro matrix/bed pair and check the first row and cut intervals."""
    hicpro_path = ROOT + 'test_matrix.hicpro'
    bed_path = ROOT + 'test_matrix.bed'
    handler = MatrixFileHandler(pFileType='hicpro', pMatrixFile=hicpro_path,
                                pBedFileHicPro=bed_path)
    assert handler is not None

    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = handler.load()

    # Expected first row: 3114 entries, all zero except three known values.
    expected_values = [41.345793] + [0.] * 3113
    expected_values[827] = 5.42079
    expected_values[1263] = 5.122642
    expected_row = np.array([expected_values])

    first_row = matrix[0].todense()
    assert first_row.shape == expected_row.shape
    nt.assert_almost_equal(matrix[0].todense(), expected_row)

    # The first twenty bins are 1 Mb wide on chr1 and numbered from 1.
    expected_intervals = np.array(
        [('chr1', idx * 1000000, (idx + 1) * 1000000, idx + 1)
         for idx in range(20)])
    nt.assert_equal(cut_intervals[0:20], expected_intervals)

    assert nan_bins is None
    assert correction_factors is None
    assert distance_counts is None
def compute_read_coverage_sparsity(pMatrixName, pMatricesList, pXDimension, pMaximumRegionToConsider, pQueue):
    """Compute read coverage and a sparsity value for every given matrix.

    Parameters
    ----------
    pMatrixName : str
        Path of the container file; cells are addressed as
        ``pMatrixName + '::' + <cell name>``.
    pMatricesList : list of str
        Cell names to process. The first one provides the bin size and the
        matrix dimension used for all of them.
    pXDimension
        Not used by this function.
    pMaximumRegionToConsider : int
        Genomic distance limit; divided by the bin size to get the maximal
        bin distance that counts towards the sparsity value.
    pQueue
        Object with a ``put`` method; receives ``[read_coverage, sparsity]``,
        two lists with one entry per matrix.
    """
    read_coverage = []
    sparsity = []

    log.debug('read covarage and sparsity')
    hic_ma = hm.hiCMatrix(pMatrixFile=pMatrixName + '::' + pMatricesList[0])
    bin_size = hic_ma.getBinSize()
    shape_x = hic_ma.matrix.shape[0]

    # Both values are identical for every matrix, so compute them once.
    # NOTE(review): max_distance is 0 when pMaximumRegionToConsider is smaller
    # than the bin size, which makes the division below fail -- unchanged.
    max_distance = pMaximumRegionToConsider // bin_size
    sparsity_denominator = shape_x * max_distance

    for matrix in pMatricesList:
        matrixFileHandler = MatrixFileHandler(pFileType='cool',
                                              pMatrixFile=pMatrixName + '::' + matrix,
                                              pLoadMatrixOnly=True)
        _matrix, _, _, _, _ = matrixFileHandler.load()

        # A matrix-only load is used here as (rows, columns, values, ...).
        instances = _matrix[0]
        features = _matrix[1]
        data = _matrix[2]

        distances = np.absolute(instances - features)
        mask = distances <= max_distance
        sparsity_length = len(data[mask])
        sparsity.append(sparsity_length / sparsity_denominator)

        # only upper half is loaded --> times 2
        read_coverage_sum = data.sum() * 2
        # minus the double main diagonal
        mask = distances == 0
        read_coverage_sum -= data[mask].sum()
        read_coverage.append(read_coverage_sum)

    pQueue.put([read_coverage, sparsity])
def test_hicConvertFormat_h5_to_homer():
    """Convert a cool matrix to homer format and compare the values with the source.

    The converter output is read with gzip, written uncompressed to a second
    temporary file and loaded through the homer file handler.
    """
    outfile = NamedTemporaryFile(suffix='.homer', delete=False)
    outfile.close()

    args = "--matrices {} --outFileName {} --inputFormat cool --outputFormat homer ".format(
        original_matrix_cool_chr4, outfile.name).split()
    compute(hicConvertFormat.main, args, 5)

    test = hm.hiCMatrix(original_matrix_cool_chr4)

    # Use a context manager so the gzip handle is closed again; it was left
    # open before.
    with gzip.open(outfile.name, 'rb') as compressed_file:
        file_content = compressed_file.read()

    outfile2 = NamedTemporaryFile(suffix='.homer', delete=False)
    outfile2.close()
    with open(outfile2.name, 'wb') as matrix_file:
        matrix_file.write(file_content)

    matrixFileHandlerInput = MatrixFileHandler(pFileType='homer',
                                               pMatrixFile=outfile2.name)
    _matrix, cut_intervals, nan_bins, \
        distance_counts, correction_factors = matrixFileHandlerInput.load()

    nt.assert_array_almost_equal(test.matrix.data, _matrix.data, decimal=0)
def compute_consensus_matrix(pMatrixName, pClusterMatricesList, pAppend, pQueue):
    """Build one summed matrix handler per cluster and put the list on ``pQueue``.

    ``pClusterMatricesList`` is a list of clusters, each a list of cell names
    addressed as ``pMatrixName + '::' + <cell name>``. The output handler of
    the first cluster is created with ``pAppend=False`` when ``pAppend`` is
    truthy; every other handler is created with ``pAppend=True``.
    """
    handlers_per_cluster = []
    for cluster_index, cluster in enumerate(pClusterMatricesList):
        append = not (cluster_index == 0 and pAppend)

        summed_matrix = None
        for cell_name in cluster:
            matrixFileHandlerInput = MatrixFileHandler(
                pFileType='cool', pMatrixFile=pMatrixName + '::' + cell_name)
            (cell_matrix, cut_intervals, nan_bins,
             distance_counts, correction_factors) = matrixFileHandlerInput.load()

            if summed_matrix is not None:
                summed_matrix += cell_matrix
            else:
                summed_matrix = cell_matrix

        # Metadata and the hic2cool version come from the last loaded cell.
        hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
        output_handler = MatrixFileHandler(pFileType='cool',
                                           pAppend=append,
                                           pEnforceInteger=False,
                                           pFileWasH5=False,
                                           pHic2CoolVersion=hic2CoolVersion)
        output_handler.set_matrix_variables(summed_matrix, cut_intervals,
                                            nan_bins, correction_factors,
                                            distance_counts)
        handlers_per_cluster.append(output_handler)

    pQueue.put(handlers_per_cluster)
def test_load_cool(capsys):
    """Load a cool file and check its first row, cut intervals, nan bins and factors."""
    cool_path = ROOT + 'Li_et_al_2015.cool'
    handler = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_path)
    assert handler is not None

    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = handler.load()

    # The first row is expected to be empty.
    expected_row = np.array([[0.] * 11104])
    nt.assert_almost_equal(matrix[0].todense(), expected_row)

    # Consecutive boundaries of the first twenty bins on chromosome X.
    boundaries = [0, 2200, 4702, 7060, 8811, 11048, 14329, 16847, 19537,
                  20701, 22321, 24083, 25983, 27619, 29733, 30973, 32214,
                  34179, 35987, 37598, 39009]
    expected_intervals = [('X', start, end, 1.0)
                          for start, end in zip(boundaries, boundaries[1:])]
    for row, interval in enumerate(cut_intervals[0:20]):
        for column, value in enumerate(interval):
            assert value == expected_intervals[row][column]

    expected_nan_bins = [0, 1, 2, 3, 4, 5, 6, 7, 30, 31]
    nt.assert_almost_equal(nan_bins[0:10], expected_nan_bins)

    expected_factors = [0., 0., 0., 0., 0., 0., 0., 0., 1.1022922, 0.796711]
    nt.assert_almost_equal(correction_factors[0:10], expected_factors)

    assert distance_counts is None
def main(args=None):
    """Load the given cool files in parallel and store them in one scool file.

    The cut intervals of the first matrix are handed to every worker. The
    matrices are split into contiguous chunks, one per worker process; each
    worker returns a list of matrix file handlers through its own queue. The
    lists are concatenated in worker order and saved to ``args.outFileName``.
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)

    matrices_list = args.matrices
    matrixFileHandlerInput = MatrixFileHandler(pFileType='cool',
                                               pMatrixFile=matrices_list[0])
    _matrix, cut_intervals_all, nan_bins, \
        distance_counts, correction_factors = matrixFileHandlerInput.load()

    # Never start more workers than there are matrices. Without the clamp the
    # chunk size becomes 0, every worker but the last gets an empty list and
    # the last one does all the work.
    threads = min(args.threads, len(matrices_list))

    matrixFileHandler_list = [None] * threads
    process = [None] * threads
    queue = [None] * threads
    thread_done = [False] * threads
    matricesPerThread = len(matrices_list) // threads

    for i in range(threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
        else:
            # The last worker also takes the remainder.
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=load_cool_files,
                             kwargs=dict(pMatricesList=matrices_name_list,
                                         pCutIntervals=cut_intervals_all,
                                         pQueue=queue[i]))
        process[i].start()

    # Poll the queues until every worker has delivered its result. The result
    # is fetched before join() so a worker is never joined with data pending.
    while True:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                matrixFileHandler_list[i] = queue[i].get()
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        if all(thread_done):
            break
        time.sleep(1)

    matrix_file_handler_object_list = [
        item for sublist in matrixFileHandler_list for item in sublist
    ]

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = matrix_file_handler_object_list
    matrixFileHandler.save(args.outFileName, pSymmetric=True, pApplyCorrection=False)
def test_load_cool_hic2cool_versions():
    """Compare the same data set stored by hic2cool 0.4.2 and 0.5.1.

    Both files are loaded with the 'KR' correction factor table; the 0.4.2
    file is additionally loaded with the '*' correction operator.
    """
    path_042 = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool042.cool'
    hic2cool_042 = MatrixFileHandler(pFileType='cool', pMatrixFile=path_042,
                                     pCorrectionFactorTable='KR',
                                     pCorrectionOperator='*')

    path_051 = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'
    hic2cool_051 = MatrixFileHandler(pFileType='cool', pMatrixFile=path_051,
                                     pCorrectionFactorTable='KR')

    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = hic2cool_042.load()
    (matrix_test, cut_intervals_test, nan_bins_test,
     distance_counts_test, correction_factors_test) = hic2cool_051.load()

    nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=0)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)
def load_cool_files(pMatrixName, pMatricesList, pCutIntervals, pQueue):
    """Load cells of a container file and put the resulting handlers on ``pQueue``.

    Each cell is read from ``pMatrixName + "::" + <cell name>`` without cut
    intervals and wrapped in a new cool handler that is named after the last
    '/'-separated part of the cell name and uses ``pCutIntervals``.

    On success the list of handlers is put on ``pQueue``. If any step raises,
    a single string starting with 'Fail: ' (message plus traceback) is put
    instead and no handlers are returned.
    """
    loaded_handlers = []
    try:
        for cell_name in pMatricesList:
            reader = MatrixFileHandler(pFileType='cool',
                                       pMatrixFile=pMatrixName + "::" + cell_name,
                                       pNoCutIntervals=True)
            (cell_matrix, cut_intervals, nan_bins,
             distance_counts, correction_factors) = reader.load()

            short_name = cell_name.split('/')[-1]
            writer = MatrixFileHandler(pFileType='cool', pMatrixFile=short_name)
            writer.set_matrix_variables(cell_matrix, pCutIntervals, nan_bins,
                                        correction_factors, distance_counts)
            loaded_handlers.append(writer)
    except Exception as exp:
        pQueue.put('Fail: ' + str(exp) + traceback.format_exc())
    else:
        pQueue.put(loaded_handlers)
def test_load_h5(capsys):
    """Load an h5 file and check its first row, cut intervals, nan bins and factors."""
    h5_path = ROOT + 'Li_et_al_2015.h5'
    handler = MatrixFileHandler(pFileType='h5', pMatrixFile=h5_path)
    assert handler is not None

    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = handler.load()

    # The first row is expected to be empty.
    expected_row = np.array([[0.] * 11104])
    nt.assert_almost_equal(matrix[0].todense(), expected_row)

    expected_intervals = [
        ('X', 0, 2200, 0.0),
        ('X', 2200, 4702, 0.0),
        ('X', 4702, 7060, 0.0),
        ('X', 7060, 8811, 0.4),
    ]
    for position, expected in enumerate(expected_intervals):
        nt.assert_equal(cut_intervals[position], expected)

    expected_nan_bins = np.array([
        0, 1, 2, 3, 4, 5, 6, 7, 30, 31,
        32, 51, 52, 53, 54, 81, 82, 83, 84, 94,
    ])
    nt.assert_equal(nan_bins[0:20], expected_nan_bins)

    assert distance_counts is None

    expected_factors = np.array(
        [0, 0, 0, 0, 0, 0, 0, 0, 0.90720049, 1.25516028])
    nt.assert_almost_equal(correction_factors[0:10], expected_factors)
def load_cool_files(pMatricesList, pCutIntervals, pQueue):
    """Load the given cool files and put the resulting handlers on ``pQueue``.

    Every file is read without cut intervals and wrapped in a new cool handler
    that uses ``pCutIntervals``. A file that cannot be processed is skipped
    with a warning; the remaining files are still loaded.
    """
    loaded_handlers = []
    for file_name in pMatricesList:
        try:
            reader = MatrixFileHandler(pFileType='cool', pMatrixFile=file_name,
                                       pNoCutIntervals=True)
            (file_matrix, cut_intervals, nan_bins,
             distance_counts, correction_factors) = reader.load()

            writer = MatrixFileHandler(pFileType='cool', pMatrixFile=file_name)
            writer.set_matrix_variables(file_matrix, pCutIntervals, nan_bins,
                                        correction_factors, distance_counts)
            loaded_handlers.append(writer)
        except Exception as exp:
            warning_text = 'File could not be opend and is excluded: {}. Error message: {} '.format(
                file_name, str(exp))
            log.warning(warning_text)

    pQueue.put(loaded_handlers)
def test_save_cool_enforce_integer():
    """Save h5 data as cool with ``pEnforceInteger=True`` and compare rounded values.

    The saved file is reloaded with ``pApplyCorrectionCoolerLoad=False`` and
    compared with the source values rounded to the nearest integer.
    """
    saved_path = outfile + '.cool'
    h5_path = ROOT + 'Li_et_al_2015.h5'

    source_handler = MatrixFileHandler(pFileType='h5', pMatrixFile=h5_path)
    assert source_handler is not None
    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = source_handler.load()

    # Write the data through a cool handler that enforces integer values.
    writer = MatrixFileHandler(pFileType='cool', pEnforceInteger=True)
    writer.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                correction_factors, distance_counts)
    writer.matrixFile.fileWasH5 = True
    writer.save(pName=saved_path, pSymmetric=False, pApplyCorrection=True)

    saved_handler = MatrixFileHandler(pFileType='cool',
                                      pMatrixFile=saved_path,
                                      pApplyCorrectionCoolerLoad=False)
    assert saved_handler is not None
    (matrix_test, cut_intervals_test, nan_bins_test,
     distance_counts_test, correction_factors_test) = saved_handler.load()

    # Round the source values and drop entries that became zero.
    matrix.data = np.rint(matrix.data)
    matrix.eliminate_zeros()

    nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=0)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)

    os.unlink(saved_path)
def test_save_scool_matrixHandlersCool():
    """Store the same matrix as three cells of an scool file and list its cells."""
    scool_file = NamedTemporaryFile(suffix='.scool', prefix='hicmatrix_scool_test')

    cool_path = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'
    source_handler = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_path)
    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = source_handler.load()

    # One cool handler per cell, all of them carrying the same loaded data.
    cell_handlers = []
    for cell_name in ('cell1', 'cell2', 'cell3'):
        cell_handler = MatrixFileHandler(pFileType='cool',
                                         pMatrixFile=cell_name,
                                         pEnforceInteger=False,
                                         pFileWasH5=False,
                                         pHic2CoolVersion=None)
        cell_handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                          correction_factors, distance_counts)
        cell_handlers.append(cell_handler)

    scool_handler = MatrixFileHandler(pFileType='scool')
    scool_handler.matrixFile.coolObjectsList = cell_handlers
    scool_handler.save(scool_file.name, pSymmetric=True, pApplyCorrection=False)

    stored_cells = cooler.fileops.list_scool_cells(scool_file.name)
    for expected_cell in ['/cells/cell1', '/cells/cell2', '/cells/cell3']:
        assert expected_cell in stored_cells
def test_save_homer():
    """Load a homer matrix, save it through the same handler and remove the output."""
    saved_path = outfile + '.homer'
    homer_path = ROOT + 'test_matrix.homer'

    handler = MatrixFileHandler(pFileType='homer', pMatrixFile=homer_path)
    assert handler is not None

    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = handler.load()

    handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                 correction_factors, distance_counts)
    # The original source marks this save call as "not implemented".
    handler.save(pName=saved_path, pSymmetric=False, pApplyCorrection=False)

    os.unlink(saved_path)
def compute_sum(pMatrixName, pMatricesList, pThread, pQueue):
    """Put the list of per-cell matrix sums on ``pQueue``.

    Cells are addressed as ``pMatrixName + '::' + <cell name>``; the sums are
    reported in the order of ``pMatricesList``. ``pThread`` is not used.
    """
    totals = []
    for cell_name in pMatricesList:
        cell_path = pMatrixName + '::' + cell_name
        handler = MatrixFileHandler(pFileType='cool', pMatrixFile=cell_path)
        (cell_matrix, cut_intervals, nan_bins,
         distance_counts, correction_factors) = handler.load()
        totals.append(cell_matrix.sum())

    pQueue.put(totals)
def test_load_h5_save_cool():
    """Save h5 data as cool with correction applied and verify the reloaded file.

    The reloaded values must equal the source values divided by the product
    of the row and column correction factors, and the reloaded correction
    factors must equal the reciprocal source factors with nan/inf set to 0.
    """
    saved_path = outfile + '.cool'
    h5_path = ROOT + 'Li_et_al_2015.h5'

    source_handler = MatrixFileHandler(pFileType='h5', pMatrixFile=h5_path)
    assert source_handler is not None
    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = source_handler.load()

    writer = MatrixFileHandler(pFileType='cool')
    writer.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                correction_factors, distance_counts)
    writer.matrixFile.fileWasH5 = True
    writer.save(pName=saved_path, pSymmetric=False, pApplyCorrection=True)

    saved_handler = MatrixFileHandler(pFileType='cool', pMatrixFile=saved_path)
    assert saved_handler is not None
    (matrix_test, cut_intervals_test, nan_bins_test,
     distance_counts_test, correction_factors_test) = saved_handler.load()

    # Expected values: entry divided by (row factor * column factor).
    rows, cols = matrix.nonzero()
    pair_factors = correction_factors[rows]
    pair_factors *= correction_factors[cols]
    expected_data = matrix.data / pair_factors
    nt.assert_almost_equal(expected_data, matrix_test.data, decimal=1)

    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)

    # Expected factors: reciprocal of the source factors, nan and inf zeroed.
    correction_factors = 1 / correction_factors
    invalid = np.isnan(correction_factors) | np.isinf(correction_factors)
    correction_factors[invalid] = 0
    nt.assert_equal(correction_factors, correction_factors_test)

    os.unlink(saved_path)
def test_save_h5():
    """Load an h5 matrix, save it through the same handler and remove the output."""
    saved_path = outfile + '.h5'
    h5_path = ROOT + 'Li_et_al_2015.h5'

    handler = MatrixFileHandler(pFileType='h5', pMatrixFile=h5_path)
    assert handler is not None

    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = handler.load()

    handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                 correction_factors, distance_counts)
    # Arguments are passed positionally, exactly as the handler is called here.
    handler.save(saved_path, True, None)

    os.unlink(saved_path)
def compute_normalize(pMatrixName, pMatricesList, pArgminSum, pSumOfAll, pAppend, pQueue):
    """Scale every given cell matrix and put the resulting handlers on ``pQueue``.

    Parameters
    ----------
    pMatrixName : str
        Path of the container file; cells are addressed as
        ``pMatrixName + '::' + <cell name>``.
    pMatricesList : list of str
        Cell names to process.
    pArgminSum
        Divisor of the adjust factor (``pSumOfAll[i] / pArgminSum``).
    pSumOfAll
        Indexed with the position of the cell inside ``pMatricesList``.
        NOTE(review): so it has to be aligned with ``pMatricesList`` (the
        per-worker chunk), not with the complete cell list -- verify callers.
    pAppend
        When truthy, the handler of the first cell is created with
        ``pAppend=False``; all other handlers use ``pAppend=True``.
    pQueue
        Object with a ``put`` method; receives the list of output handlers.
    """
    matrixFileHandlerList = []
    for i, matrix in enumerate(pMatricesList):
        append = not (i == 0 and pAppend)

        matrixFileHandler = MatrixFileHandler(pFileType='cool',
                                              pMatrixFile=pMatrixName + '::' + matrix)
        _matrix, cut_intervals, nan_bins, \
            distance_counts, correction_factors = matrixFileHandler.load()

        # Work on float32 data and clear nan/inf before scaling.
        _matrix.data = _matrix.data.astype(np.float32)
        invalid = np.isnan(_matrix.data) | np.isinf(_matrix.data)
        _matrix.data[invalid] = 0

        adjust_factor = pSumOfAll[i] / pArgminSum
        _matrix.data /= adjust_factor

        # The division can create nan/inf again (e.g. a zero adjust factor).
        invalid = np.isnan(_matrix.data) | np.isinf(_matrix.data)
        _matrix.data[invalid] = 0
        _matrix.eliminate_zeros()

        matrixFileHandlerOutput = MatrixFileHandler(pFileType='cool',
                                                    pAppend=append,
                                                    pEnforceInteger=False,
                                                    pFileWasH5=False,
                                                    pHic2CoolVersion=None)
        matrixFileHandlerOutput.set_matrix_variables(_matrix, cut_intervals, nan_bins,
                                                     correction_factors, distance_counts)
        matrixFileHandlerList.append(matrixFileHandlerOutput)

    pQueue.put(matrixFileHandlerList)
def test_hicConvertFormat_2D_text_to_cool():
    """Convert a 2D-text file to cool and compare it with the stored reference file."""
    converted_file = NamedTemporaryFile(suffix='.cool', delete=False)
    converted_file.close()

    text_2d = ROOT + '/GSM1436265_RAD21_ENCFF002EMQ.txt'
    chromosome_sizes = ROOT + '/hg19.chrom.sizes'
    command_line = "--matrices {} --outFileName {} --inputFormat 2D-text --outputFormat cool -r 10000 --chromosomeSizes {}".format(
        text_2d, converted_file.name, chromosome_sizes)
    compute(hicConvertFormat.main, command_line.split(), 5)

    converted = hm.hiCMatrix(converted_file.name)

    reference_handler = MatrixFileHandler(pFileType='cool',
                                          pMatrixFile=ROOT + '/2dtexttocool.cool')
    (reference_matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = reference_handler.load()

    # Only the upper triangle of the converted matrix is compared.
    converted.matrix = triu(converted.matrix)
    nt.assert_array_almost_equal(converted.matrix.data, reference_matrix.data,
                                 decimal=0)
def test_load_cool2(capsys):
    """Load a cool file holding a single interaction and check matrix and bins.

    The interaction is chr1 10000 / chr1 200000 at a bin size of 50000, so a
    single 1 is expected between bin 0 and bin 3.
    """
    cool_path = ROOT + 'one_interaction_4chr.cool'
    bin_size = 50000

    handler = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_path)
    assert handler is not None

    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = handler.load()

    # Exactly one stored value.
    nt.assert_almost_equal(matrix.data, np.array([1]))

    # Row 3 is empty, row 0 holds the 1 in column 3.
    expected_row = np.array([[0] * 9167])
    nt.assert_almost_equal(matrix[3].todense(), expected_row)
    expected_row[0][3] = 1
    nt.assert_almost_equal(matrix[0].todense(), expected_row)

    # Per chromosome: number of full bins and the end of the shorter last bin.
    chromosome_layout = [('chr1', 3909, 195471971),
                         ('chrX', 3420, 171031299),
                         ('chrY', 1834, 91744698),
                         ('chrM', 0, 16299)]
    expected_intervals = []
    for chromosome, full_bins, chromosome_end in chromosome_layout:
        for bin_index in range(full_bins):
            expected_intervals.append(
                (chromosome, bin_index * bin_size, (bin_index + 1) * bin_size, 1.0))
        expected_intervals.append(
            (chromosome, full_bins * bin_size, chromosome_end, 1.0))

    for row, interval in enumerate(cut_intervals):
        for column, value in enumerate(interval):
            assert value == expected_intervals[row][column]

    expected_nan_bins = [0, 1, 2, 4]
    nt.assert_almost_equal(nan_bins[:4], expected_nan_bins)

    assert distance_counts is None
    assert correction_factors is None
def test_hicConvertFormat_hicpro_to_cool():
    """Convert a HiC-Pro matrix to cool and compare it with a direct HiC-Pro load."""
    converted_file = NamedTemporaryFile(suffix='.cool', delete=False)
    converted_file.close()

    hicprofile = ROOT + '/test_matrix.hicpro'
    bedfile = ROOT + '/test_matrix.bed'
    command_line = "--matrices {} --outFileName {} --inputFormat hicpro --outputFormat cool --bedFileHicpro {}".format(
        hicprofile, converted_file.name, bedfile)
    compute(hicConvertFormat.main, command_line.split(), 5)

    converted = hm.hiCMatrix(converted_file.name)

    source_handler = MatrixFileHandler(pFileType='hicpro',
                                       pMatrixFile=hicprofile,
                                       pBedFileHicPro=bedfile)
    (source_matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = source_handler.load()

    # Only the upper triangle of the converted matrix is compared.
    converted.matrix = triu(converted.matrix)
    nt.assert_array_almost_equal(converted.matrix.data, source_matrix.data,
                                 decimal=0)
def test_load_homer(capsys):
    """Load a homer matrix and check its first row and cut intervals."""
    homer_path = ROOT + 'test_matrix.homer'
    handler = MatrixFileHandler(pFileType='homer', pMatrixFile=homer_path)
    assert handler is not None

    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = handler.load()

    expected_row = np.array([[
        1.0, 0.1896, 0.2163, 0.08288, 0.1431, 0.2569, 0.1315,
        0.1488, -0.0312, 0.143, 0.06091, 0.03546, 0.1168,
    ]])
    nt.assert_almost_equal(matrix[0].todense(), expected_row)

    # Thirteen 20 kb bins on 3R, starting at 1,000,000.
    expected_intervals = [('3R', start, start + 20000, 1)
                          for start in range(1000000, 1260000, 20000)]
    nt.assert_equal(cut_intervals, expected_intervals)

    assert nan_bins is None
    assert distance_counts is None
    assert correction_factors is None
def main(args=None):
    """Load cells of a container file in parallel and write them out again.

    For the actions 'extractToCool' and 'extractScool' the cells can be limited
    to the names listed in ``args.cellList`` (one per line, with or without the
    '/cells' prefix); names that are not in the container are ignored.

    For the actions 'extractScool' and 'update' all loaded cells are saved as
    one scool file at ``args.outFileName``. Otherwise ``args.outFileName`` is
    used as a directory and every cell is saved there as ``<cell name>.cool``.

    Raises
    ------
    OSError
        If no cell is left for processing.

    The process exits with status 1 if a worker reports a failure.
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)

    matrices_list = cell_name_list(args.matrix)
    if args.action in ['extractToCool', 'extractScool']:
        if args.cellList is not None:
            # Set for constant-time lookups; the order of the cell list file
            # (including duplicates) is kept in the resulting list.
            available_cells = set(matrices_list)
            matrix_list_tmp = []
            with open(args.cellList, 'r') as file:
                for line in file:
                    values = line.strip()
                    log.debug('values {}'.format(values))
                    if not values.startswith('/cells'):
                        values = '/cells/' + values
                    if values in available_cells:
                        matrix_list_tmp.append(values)
            matrices_list = matrix_list_tmp

    if len(matrices_list) == 0:
        raise OSError('No cells for processing. Terminating.')
    if len(matrices_list) < args.threads:
        args.threads = len(matrices_list)

    # The cut intervals of the first cell are handed to every worker.
    matrixFileHandlerInput = MatrixFileHandler(pFileType='cool',
                                               pMatrixFile=args.matrix + "::" + matrices_list[0])
    _matrix, cut_intervals_all, nan_bins, \
        distance_counts, correction_factors = matrixFileHandlerInput.load()

    threads = args.threads
    matrixFileHandler_list = [None] * threads
    process = [None] * threads
    queue = [None] * threads
    thread_done = [False] * threads
    matricesPerThread = len(matrices_list) // threads

    for i in range(threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
        else:
            # The last worker also takes the remainder.
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=load_cool_files,
                             kwargs=dict(pMatrixName=args.matrix,
                                         pMatricesList=matrices_name_list,
                                         pCutIntervals=cut_intervals_all,
                                         pQueue=queue[i]))
        process[i].start()

    fail_flag = False
    fail_message = ''
    # Poll the queues until every worker has delivered its result. The result
    # is fetched before join() so a worker is never joined with data pending.
    while True:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                result = queue[i].get()
                matrixFileHandler_list[i] = result
                # Workers report a failure as a string containing 'Fail:'.
                # The type check avoids comparing every handler in a result
                # list against that string.
                if isinstance(result, str) and 'Fail:' in result:
                    fail_flag = True
                    fail_message = result[6:]
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        if all(thread_done):
            break
        time.sleep(1)

    if fail_flag:
        log.error(fail_message)
        exit(1)

    matrix_file_handler_object_list = [
        item for sublist in matrixFileHandler_list for item in sublist
    ]

    if args.action in ['extractScool', 'update']:
        matrixFileHandler = MatrixFileHandler(pFileType='scool')
        matrixFileHandler.matrixFile.coolObjectsList = matrix_file_handler_object_list
        matrixFileHandler.save(args.outFileName, pSymmetric=True, pApplyCorrection=False)
    else:
        # exist_ok covers both an already existing directory and the race of
        # another process creating it at the same time.
        os.makedirs(args.outFileName, exist_ok=True)
        for matrixFileHandler in matrix_file_handler_object_list:
            matrixFileHandler.save(
                args.outFileName + '/' + matrixFileHandler.matrixFile.matrixFileName + '.cool',
                pApplyCorrection=True,
                pSymmetric=True)
def main(args=None):
    """Run ``compute_correction`` on all cells of a scool matrix and save the result.

    The cells of ``args.matrix`` are split into contiguous chunks, one per
    worker process. Every worker puts its result on its own queue; the
    results are concatenated in worker order and written to
    ``args.outFileName`` as one scool file. A worker reports a failure by
    returning a string, which is logged before the program exits with
    status 1.

    Parameters
    ----------
    args : list of str, optional
        Command line arguments handed to the argument parser.
    """
    args = parse_arguments().parse_args(args)

    matrices_list = cell_name_list(args.matrix)
    if not matrices_list:
        log.error('No cells for processing. Terminating.')
        exit(1)

    # Never start more workers than there are cells. All per-worker lists
    # are sized AFTER this clamp: a result list sized with the requested
    # thread count keeps trailing None entries that break the flattening
    # at the end of this function.
    threads = min(args.threads, len(matrices_list))

    # The first cell is loaded up front; only its cut intervals are used,
    # they are handed to every worker.
    matrixFileHandlerInput = MatrixFileHandler(
        pFileType='cool', pMatrixFile=args.matrix + "::" + matrices_list[0])
    _matrix, cut_intervals_all, nan_bins, \
        distance_counts, correction_factors = matrixFileHandlerInput.load()

    matrixFileHandler_list = [None] * threads
    thread_done = [False] * threads
    queue = [None] * threads
    process = [None] * threads
    matricesPerThread = len(matrices_list) // threads

    print('Threads: ' + str(threads))
    for i in range(threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
        else:
            # The last worker also takes the remainder of the division.
            matrices_name_list = matrices_list[i * matricesPerThread:]
        queue[i] = Queue()
        process[i] = Process(target=compute_correction, kwargs=dict(
            pMatrixName=args.matrix,
            pMatrixList=matrices_name_list,
            pCutIntervals=cut_intervals_all,
            pQueue=queue[i]))
        process[i].start()

    fail_flag = False
    while True:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                matrixFileHandler_list[i] = queue[i].get()
                # A plain string instead of a result list is an error report.
                if isinstance(matrixFileHandler_list[i], str):
                    log.error('{}'.format(matrixFileHandler_list[i]))
                    fail_flag = True
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        if all(thread_done):
            break
        # Poll again later instead of spinning on the queues.
        time.sleep(1)

    if fail_flag:
        exit(1)

    matrix_file_handler_object_list = [
        item for sublist in matrixFileHandler_list for item in sublist
    ]

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = matrix_file_handler_object_list
    matrixFileHandler.save(args.outFileName,
                           pSymmetric=True,
                           pApplyCorrection=False)
def main(args=None):
    """Convert Hi-C matrices from one file format to another.

    * ``hic`` input with ``cool`` output is delegated to
      ``hic2cool_convert``, once per matrix or once per matrix and
      requested resolution.
    * ``hicpro``, ``homer``, ``h5`` and ``cool`` input is loaded with a
      ``MatrixFileHandler`` and written as ``cool``, ``h5``, ``homer``,
      ``ginteractions`` or ``mcool``.

    For ``mcool`` output everything is stored in ``args.outFileName[0]``
    below ``::/resolutions/<bin size>``: either one matrix merged to each
    of ``args.resolutions``, or every input matrix at its own bin size.

    Parameters
    ----------
    args : list of str, optional
        Command line arguments handed to the argument parser.
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!'
            )
            exit(1)

    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:
                for resolution in args.resolutions:
                    # Insert '_<resolution>' in front of the last '.' of the
                    # output name; a name without any '.' gets it appended.
                    base, dot, suffix = args.outFileName[i].rpartition('.')
                    if dot:
                        out_name = base + '_' + str(resolution) + '.' + suffix
                    else:
                        out_name = args.outFileName[i] + '_' + str(resolution)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']:
        format_was_h5 = args.inputFormat == 'h5'
        applyCorrection = not args.store_applied_correction

        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
            else:
                correction_operator = '/' if args.correction_division else None
                chromosomes_to_load = [args.chromosome] if args.chromosome else None
                applyCorrectionCoolerLoad = not args.load_raw_values
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)

            _matrix, cut_intervals, nan_bins, \
                distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                if args.outputFormat in ['homer', 'ginteractions']:
                    # make it a upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)

                hic2CoolVersion = None
                if args.inputFormat == 'cool':
                    hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version

                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5,
                    pHic2CoolVersion=hic2CoolVersion)
                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)
            elif args.outputFormat in ['mcool']:
                log.debug('outformat is mcool')
                # NOTE(review): this misuse is only logged, processing goes on.
                if args.resolutions and len(args.matrices) > 1:
                    log.error(
                        'Please define one matrix and many resolutions which should be created or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()

                    for j, resolution in enumerate(args.resolutions):
                        # Every resolution starts from its own copy of the
                        # loaded matrix.
                        hic_matrix_res = deepcopy(hic_matrix)

                        _mergeFactor = int(resolution) // bin_size
                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            merged_matrix = hic_matrix_res

                        # The first resolution creates the file, all further
                        # ones are appended to it.
                        append = j > 0
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=append,
                            pFileWasH5=format_was_h5)
                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' + str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)
                else:
                    # One resolution per input matrix: the first matrix
                    # creates the file, all further ones are appended.
                    append = i > 0
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        pAppend=append,
                        pFileWasH5=format_was_h5)
                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' + str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
def main(args=None):
    """Convert Hi-C matrices from one file format to another.

    * ``hic`` input with ``cool`` output is delegated to
      ``hic2cool_convert``, once per matrix or once per matrix and
      requested resolution.
    * ``hicpro``, ``homer``, ``h5`` and ``cool`` input is loaded with a
      ``MatrixFileHandler`` and written as ``cool``, ``h5``, ``homer``,
      ``ginteractions`` (output name plus ``'.' + args.outputFormat``) or
      as ``mcool`` (``args.outFileName[0] + '.mcool'``, one group per
      resolution below ``::/resolutions/``).

    Parameters
    ----------
    args : list of str, optional
        Command line arguments handed to the argument parser.
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!'
            )
            exit(1)

    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:
                for resolution in args.resolutions:
                    # The name has to be built per resolution, inside the
                    # loop that binds `resolution`: '_<resolution>' goes in
                    # front of the last '.' of the output name, a name
                    # without any '.' gets it appended.
                    base, dot, suffix = args.outFileName[i].rpartition('.')
                    if dot:
                        out_name = base + '_' + str(resolution) + '.' + suffix
                    else:
                        out_name = args.outFileName[i] + '_' + str(resolution)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']:
        applyCorrection = not args.store_applied_correction

        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
            else:
                correction_operator = '/' if args.correction_division else None
                chromosomes_to_load = [args.chromosome] if args.chromosome else None
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer)

            # load() yields the distance counts BEFORE the correction
            # factors (the order used by the other callers of load());
            # unpacking them the other way round would exchange the two
            # values when they are forwarded below.
            _matrix, cut_intervals, nan_bins, \
                distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat)
                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                matrixFileHandlerOutput.save(
                    args.outFileName[i] + '.' + args.outputFormat,
                    pSymmetric=True,
                    pApplyCorrection=applyCorrection)
            elif args.outputFormat in ['mcool']:
                log.debug('outformat is mcool')
                # NOTE(review): this misuse is only logged, processing goes on.
                if args.resolutions and len(args.matrices) > 1:
                    log.error(
                        'Please define either one matrix and many resolutions which should be created.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()

                    for resolution in args.resolutions:
                        _mergeFactor = int(resolution) // bin_size
                        merged_matrix = hicMergeMatrixBins.merge_bins(
                            hic_matrix, _mergeFactor)
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer)
                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '.mcool' + '::/resolutions/' + str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)
                else:
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool')
                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '.mcool' + '::/resolutions/' + str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
def main(args=None):
    """Convert Hi-C matrices from one file format to another.

    * ``hic`` input can only be written as ``cool``; this is delegated to
      ``hic2cool_convert``, once per matrix or once per matrix and
      requested resolution.
    * ``hicpro``, ``homer``, ``h5`` and ``cool`` input is loaded with a
      ``MatrixFileHandler``.
    * ``2D-text`` input (tab separated: chromosome, start, end,
      chromosome, start, end, value) is filled into an empty matrix whose
      bins are derived from ``args.chromosomeSizes`` and the first entry
      of ``args.resolutions``.

    The loaded matrix is written as ``cool``, ``h5``, ``homer``,
    ``ginteractions``, ``hicpro`` or ``mcool``. For ``mcool`` everything
    is stored in ``args.outFileName[0]`` below
    ``::/resolutions/<bin size>``.

    Parameters
    ----------
    args : list of str, optional
        Command line arguments handed to the argument parser.
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!: Input matrices {}; output matrices {}'
                .format(len(args.matrices), len(args.outFileName)))
            exit(1)
    if args.inputFormat == 'hic' and args.outputFormat != 'cool':
        log.error('The export of a hic file is only possible to a cool file.')
        exit(1)

    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:
                for resolution in args.resolutions:
                    # Insert '_<resolution>' in front of the last '.' of the
                    # output name; a name without any '.' gets it appended.
                    base, dot, suffix = args.outFileName[i].rpartition('.')
                    if dot:
                        out_name = base + '_' + str(resolution) + '.' + suffix
                    else:
                        out_name = args.outFileName[i] + '_' + str(resolution)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool', '2D-text']:
        format_was_h5 = args.inputFormat == 'h5'
        applyCorrection = not args.store_applied_correction

        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)
        if args.inputFormat == '2D-text':
            if args.resolutions is None:
                log.error('The resolution must be defined via --resolutions')
                sys.exit(1)
            if args.chromosomeSizes is None:
                log.error(
                    'The sizes of the chromosomes must be defined via --chromosomeSizes.'
                )
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])

                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = matrixFileHandlerInput.load()
            elif args.inputFormat == '2D-text':
                # Chromosome sizes: tab separated name and length per line,
                # read up to the first blank line or the end of the file.
                chrom_sizes = OrderedDict()
                with open(args.chromosomeSizes.name, 'r') as file:
                    for raw_line in file:
                        entry = raw_line.strip()
                        if entry == '':
                            break
                        line_split = entry.split('\t')
                        chrom_sizes[line_split[0]] = int(line_split[1])
                chrom_sizes = list(chrom_sizes.items())

                args.resolutions = [int(x) for x in args.resolutions]

                # Fixed-size bins per chromosome; the last bin of a
                # chromosome is clipped to the chromosome length.
                cut_intervals = []
                for chromosome in chrom_sizes:
                    for interval in range(0, chromosome[1], args.resolutions[0]):
                        cut_intervals.append(
                            (chromosome[0], interval,
                             min(chromosome[1], interval + args.resolutions[0]),
                             1.0))

                hic_matrix_csr = lil_matrix(
                    (len(cut_intervals), len(cut_intervals)))
                log.debug('cut_intervals {}'.format(cut_intervals[:20]))

                hic_matrix = HiCMatrix.hiCMatrix()
                hic_matrix.setMatrix(hic_matrix_csr, cut_intervals)

                with open(matrix, 'r') as file:
                    for j, line in enumerate(file):
                        line_split = line.split('\t')
                        chromosome_1 = str(line_split[0])
                        start_1 = int(line_split[1])
                        end_1 = int(line_split[2])

                        chromosome_2 = str(line_split[3])
                        start_2 = int(line_split[4])
                        end_2 = int(line_split[5])

                        value = float(line_split[6])
                        bin_id_1 = hic_matrix.getRegionBinRange(
                            chromosome_1, start_1, end_1)
                        bin_id_2 = hic_matrix.getRegionBinRange(
                            chromosome_2, start_2, end_2)
                        # Best effort: an entry that cannot be stored is
                        # only reported on debug level and skipped.
                        try:
                            hic_matrix.matrix[bin_id_1, bin_id_2] = value
                        except Exception as exp:
                            log.debug(str(exp))
                        if j % 1000 == 0:
                            log.debug('{} lines computed'.format(j))
                log.debug('csr with values filled!')

                hic_matrix.matrix = hic_matrix.matrix.tocsr()
                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = hic_matrix.matrix, hic_matrix.cut_intervals, hic_matrix.nan_bins, \
                    hic_matrix.distance_counts, hic_matrix.correction_factors
            else:
                correction_operator = '/' if args.correction_division else None
                chromosomes_to_load = [args.chromosome] if args.chromosome else None
                applyCorrectionCoolerLoad = not args.load_raw_values
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)

                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = matrixFileHandlerInput.load()
                log.debug('cut_intervals {}'.format(cut_intervals[:20]))

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                log.debug('cool h5 homer ginteractions hicpro branch')
                if args.outputFormat in ['homer', 'ginteractions']:
                    log.debug('homer ginteractions branch')
                    # make it a upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)

                hic2CoolVersion = None
                cool_metadata = None
                if args.inputFormat == 'cool':
                    hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                    cool_metadata = matrixFileHandlerInput.matrixFile.hic_metadata
                    log.debug('cool_metadata {}'.format(cool_metadata))

                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5,
                    pHic2CoolVersion=hic2CoolVersion,
                    pHiCInfo=cool_metadata)
                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                log.debug('len(args.outFileName) {}, i {}'.format(
                    len(args.outFileName), i))
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)
            elif args.outputFormat == 'hicpro':
                log.debug('hicpro branch')
                if len(args.matrices) == len(args.outFileName) and len(
                        args.outFileName) == len(args.bedFileHicpro):
                    log.debug('args.bedFileHicpro[i] {}'.format(
                        args.bedFileHicpro[i]))
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType=args.outputFormat,
                        pBedFileHicPro=args.bedFileHicpro[i])

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[i],
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
                else:
                    # The input files live in `args.matrices`; the parsed
                    # arguments are never accessed as `args.matrix` here.
                    log.error(
                        'The number of input matrices, output files and bed files does not match: Input: {}; Output: {}; Bed: {}'
                        .format(len(args.matrices), len(args.outFileName),
                                len(args.bedFileHicpro)))
                    exit(1)
            elif args.outputFormat in ['mcool']:
                log.debug('outformat is mcool')
                # NOTE(review): this misuse is only logged, processing goes on.
                if args.resolutions and len(args.matrices) > 1:
                    log.error(
                        'Please define one matrix and many resolutions which should be created or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()

                    hic2CoolVersion = None
                    cool_metadata = None
                    if args.inputFormat == 'cool':
                        hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                        cool_metadata = matrixFileHandlerInput.matrixFile.hic_metadata

                    for j, resolution in enumerate(args.resolutions):
                        # Every resolution starts from its own copy of the
                        # loaded matrix.
                        hic_matrix_res = deepcopy(hic_matrix)

                        _mergeFactor = int(resolution) // bin_size
                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            merged_matrix = hic_matrix_res

                        # The first resolution creates the file, all further
                        # ones are appended to it.
                        append = j > 0
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=append,
                            pFileWasH5=format_was_h5,
                            pHic2CoolVersion=hic2CoolVersion,
                            pHiCInfo=cool_metadata)
                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' + str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)
                else:
                    # One resolution per input matrix: the first matrix
                    # creates the file, all further ones are appended.
                    append = i > 0
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        pAppend=append,
                        pFileWasH5=format_was_h5)
                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' + str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)