Example #1
0
def test_load_cool(capsys):
    """Load 'Li_et_al_2015.cool' and verify the first row, bins, nan bins and factors."""
    cool_path = ROOT + 'Li_et_al_2015.cool'
    handler = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_path)
    assert handler is not None

    loaded = handler.load()
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = loaded

    # The first matrix row is expected to hold only zeros.
    expected_row = np.array([[0.] * 11104])
    nt.assert_almost_equal(matrix[0].todense(), expected_row)

    expected_intervals = [
        ('X', 0, 2200, 1.0), ('X', 2200, 4702, 1.0),
        ('X', 4702, 7060, 1.0), ('X', 7060, 8811, 1.0),
        ('X', 8811, 11048, 1.0), ('X', 11048, 14329, 1.0),
        ('X', 14329, 16847, 1.0), ('X', 16847, 19537, 1.0),
        ('X', 19537, 20701, 1.0), ('X', 20701, 22321, 1.0),
        ('X', 22321, 24083, 1.0), ('X', 24083, 25983, 1.0),
        ('X', 25983, 27619, 1.0), ('X', 27619, 29733, 1.0),
        ('X', 29733, 30973, 1.0), ('X', 30973, 32214, 1.0),
        ('X', 32214, 34179, 1.0), ('X', 34179, 35987, 1.0),
        ('X', 35987, 37598, 1.0), ('X', 37598, 39009, 1.0),
    ]
    # Field-by-field comparison of the first 20 bins.
    for position, observed in enumerate(cut_intervals[0:20]):
        expected = expected_intervals[position]
        for field, value in enumerate(observed):
            assert value == expected[field]

    expected_nan_bins = [0, 1, 2, 3, 4, 5, 6, 7, 30, 31]
    nt.assert_almost_equal(nan_bins[0:10], expected_nan_bins)

    expected_factors = [0., 0., 0., 0., 0., 0., 0., 0., 1.1022922, 0.796711]
    nt.assert_almost_equal(correction_factors[0:10], expected_factors)

    assert distance_counts is None
Example #2
0
def test_hicConvertFormat_h5_to_homer():
    """Convert the chr4 matrix to homer format and compare it with the source data.

    Despite the function name, the conversion is invoked with
    ``--inputFormat cool``. The converted file is read back through ``gzip``,
    re-written unchanged to a second temporary file, and that file is loaded
    with a 'homer' MatrixFileHandler. The loaded values are compared to the
    original matrix with a tolerance of zero decimals.
    """
    outfile = NamedTemporaryFile(suffix='.homer', delete=False)
    outfile.close()

    args = "--matrices {} --outFileName {} --inputFormat cool --outputFormat homer ".format(
        original_matrix_cool_chr4, outfile.name).split()
    compute(hicConvertFormat.main, args, 5)

    test = hm.hiCMatrix(original_matrix_cool_chr4)

    # The context manager closes the gzip handle even if read() raises;
    # previously the handle was never closed.
    with gzip.open(outfile.name, 'rb') as f:
        file_content = f.read()

    outfile2 = NamedTemporaryFile(suffix='.homer', delete=False)
    outfile2.close()
    with open(outfile2.name, 'wb') as matrix_file:
        matrix_file.write(file_content)

    matrixFileHandlerInput = MatrixFileHandler(pFileType='homer',
                                               pMatrixFile=outfile2.name)

    _matrix, cut_intervals, nan_bins, \
        distance_counts, correction_factors = matrixFileHandlerInput.load()

    nt.assert_array_almost_equal(test.matrix.data, _matrix.data, decimal=0)
Example #3
0
def compute_merge(pMatrixName, pMatrixList, pRunningWindow, pNumBins, pQueue):
    """Merge bins of each listed matrix and put the resulting handlers on the queue.

    For every entry of pMatrixList the matrix ``pMatrixName::entry`` is
    opened, merged either with ``running_window_merge`` (pRunningWindow
    truthy) or ``merge_bins``, and wrapped in a 'cool' MatrixFileHandler.
    On success the list of handlers is put on pQueue; on any exception a
    one-element list ``["Fail: <message>"]`` is put instead.
    """
    handlers = []
    try:
        for cell_name in pMatrixList:
            hic = hm.hiCMatrix(pMatrixName + '::' + cell_name)

            merge = running_window_merge if pRunningWindow else merge_bins
            merged = merge(hic, pNumBins)

            handler = MatrixFileHandler(pFileType='cool',
                                        pMatrixFile=cell_name,
                                        pEnforceInteger=False,
                                        pFileWasH5=False)
            handler.set_matrix_variables(merged.matrix,
                                         merged.cut_intervals,
                                         merged.nan_bins,
                                         merged.correction_factors,
                                         merged.distance_counts)
            handlers.append(handler)

        pQueue.put(handlers)
    except Exception as exp:
        pQueue.put(["Fail: {}".format(str(exp))])
def test_load_cool_hic2cool_versions():
    """Check that the 'hic2cool042' and 'hic2cool051' test files load to the same content.

    The 042 file is opened with the 'KR' correction table and the '*'
    correction operator, the 051 file with the 'KR' table only.
    """
    prefix = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool'

    handler_042 = MatrixFileHandler(pFileType='cool',
                                    pMatrixFile=prefix + '042.cool',
                                    pCorrectionFactorTable='KR',
                                    pCorrectionOperator='*')
    handler_051 = MatrixFileHandler(pFileType='cool',
                                    pMatrixFile=prefix + '051.cool',
                                    pCorrectionFactorTable='KR')

    matrix, cut_intervals, nan_bins, distance_counts, _ = handler_042.load()
    (matrix_test, cut_intervals_test, nan_bins_test,
     distance_counts_test, _) = handler_051.load()

    nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=0)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)
def test_save_scool_pixeltables():
    """Save three copies of one pixel table as an scool file and check the cell list."""
    outfile = NamedTemporaryFile(suffix='.scool',
                                 prefix='hicmatrix_scool_test')

    source_cool = cooler.Cooler(
        ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool')
    bins = source_cool.bins()[:]
    pixels = source_cool.pixels()[:]

    cell_names = ['cell1', 'cell2', 'cell3']

    handler = MatrixFileHandler(pFileType='scool')
    # Feed the bin and pixel tables directly instead of a list of cool objects.
    scool_file = handler.matrixFile
    scool_file.coolObjectsList = None
    scool_file.bins = bins
    scool_file.pixel_list = [pixels] * 3
    scool_file.name_list = cell_names
    handler.save(outfile.name, pSymmetric=True, pApplyCorrection=False)

    stored_cells = cooler.fileops.list_scool_cells(outfile.name)
    for cell_path in ['/cells/cell1', '/cells/cell2', '/cells/cell3']:
        assert cell_path in stored_cells
Example #6
0
def compute_read_coverage_sparsity(pMatrixName, pMatricesList, pXDimension, pMaximumRegionToConsider, pQueue):
    """Compute read coverage and a sparsity ratio per matrix and queue both lists.

    Bin size and the number of bins are taken from the first matrix of
    pMatricesList. Each matrix is loaded with ``pLoadMatrixOnly=True``; the
    loaded object is indexed as [0] row ids, [1] column ids, [2] values.
    ``[read_coverage, sparsity]`` is put on pQueue. pXDimension is not used.
    """
    log.debug('read covarage and sparsity')
    reference = hm.hiCMatrix(pMatrixFile=pMatrixName + '::' + pMatricesList[0])
    bin_size = reference.getBinSize()
    number_of_bins = reference.matrix.shape[0]

    coverage_per_matrix = []
    sparsity_per_matrix = []
    for cell_name in pMatricesList:
        handler = MatrixFileHandler(pFileType='cool',
                                    pMatrixFile=pMatrixName + '::' + cell_name,
                                    pLoadMatrixOnly=True)
        loaded, _, _, _, _ = handler.load()

        max_distance = pMaximumRegionToConsider // bin_size
        rows, columns, counts = loaded[0], loaded[1], loaded[2]
        offsets = np.absolute(rows - columns)

        # Entries within the considered distance, relative to the band size.
        in_range = offsets <= max_distance
        sparsity_per_matrix.append(
            len(counts[in_range]) / (number_of_bins * max_distance))

        # Only the upper half is loaded, so double the sum and subtract the
        # main diagonal, which would otherwise be counted twice.
        on_diagonal = offsets == 0
        coverage_per_matrix.append(counts.sum() * 2 - counts[on_diagonal].sum())

    pQueue.put([coverage_per_matrix, sparsity_per_matrix])
Example #7
0
def test_load_hicpro(capsys):
    """Load the hicpro test matrix with its bed file and verify row 0 and the bins."""
    matrix_path = ROOT + 'test_matrix.hicpro'
    bed_path = ROOT + 'test_matrix.bed'
    handler = MatrixFileHandler(pFileType='hicpro',
                                pMatrixFile=matrix_path,
                                pBedFileHicPro=bed_path)
    assert handler is not None

    loaded = handler.load()
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = loaded

    # Expected first row: 3114 entries, all zero except three positions.
    expected_row = np.zeros((1, 3114))
    expected_row[0, 0] = 41.345793
    expected_row[0, 827] = 5.42079
    expected_row[0, 1263] = 5.122642

    first_row = matrix[0].todense()
    assert first_row.shape == expected_row.shape
    nt.assert_almost_equal(first_row, expected_row)

    # First 20 bins: 1 Mb wide on chr1, the fourth field counts up from 1.
    megabase = 1000000
    expected_intervals = np.array(
        [('chr1', k * megabase, (k + 1) * megabase, k + 1) for k in range(20)])
    nt.assert_equal(cut_intervals[0:20], expected_intervals)

    assert nan_bins is None
    assert correction_factors is None
    assert distance_counts is None
Example #8
0
def test_load_h5(capsys):
    """Load 'Li_et_al_2015.h5' and verify the first row, bins, nan bins and factors."""
    h5_path = ROOT + 'Li_et_al_2015.h5'
    handler = MatrixFileHandler(pFileType='h5', pMatrixFile=h5_path)
    assert handler is not None

    loaded = handler.load()
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = loaded

    # The first matrix row is expected to hold only zeros.
    expected_row = np.array([[0.] * 11104])
    nt.assert_almost_equal(matrix[0].todense(), expected_row)

    expected_first_intervals = [
        ('X', 0, 2200, 0.0),
        ('X', 2200, 4702, 0.0),
        ('X', 4702, 7060, 0.0),
        ('X', 7060, 8811, 0.4),
    ]
    for position, expected in enumerate(expected_first_intervals):
        nt.assert_equal(cut_intervals[position], expected)

    expected_nan_bins = np.array([0, 1, 2, 3, 4, 5, 6, 7, 30, 31,
                                  32, 51, 52, 53, 54, 81, 82, 83, 84, 94])
    nt.assert_equal(nan_bins[0:20], expected_nan_bins)

    assert distance_counts is None

    expected_factors = np.array(
        [0, 0, 0, 0, 0, 0, 0, 0, 0.90720049, 1.25516028])
    nt.assert_almost_equal(correction_factors[0:10], expected_factors)
Example #9
0
def test_save_cool():
    """Save a loaded cool matrix, reload it and compare all five components."""
    target_path = outfile + '.cool'

    source_path = ROOT + 'Li_et_al_2015.cool'
    source_handler = MatrixFileHandler(pFileType='cool', pMatrixFile=source_path)
    assert source_handler is not None

    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = source_handler.load()

    # Hand the loaded data back to the handler and write it out.
    source_handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                        correction_factors, distance_counts)
    source_handler.save(pName=target_path, pSymmetric=True, pApplyCorrection=True)

    reloaded_handler = MatrixFileHandler(pFileType='cool', pMatrixFile=target_path)
    assert reloaded_handler is not None
    (matrix_test, cut_intervals_test, nan_bins_test,
     distance_counts_test, correction_factors_test) = reloaded_handler.load()

    comparisons = (
        (matrix.data, matrix_test.data),
        (cut_intervals, cut_intervals_test),
        (nan_bins, nan_bins_test),
        (distance_counts, distance_counts_test),
        (correction_factors, correction_factors_test),
    )
    for original, reloaded in comparisons:
        nt.assert_equal(original, reloaded)

    os.unlink(target_path)
def compute_consensus_matrix(pMatrixName, pClusterMatricesList, pClusterName,
                             pQueue):
    """Sum all matrices of one cluster and put the resulting handler on the queue.

    The first matrix of pClusterMatricesList is loaded completely; its cut
    intervals, nan bins, correction factors and distance counts are reused
    for the output. Every further matrix is loaded with
    ``pLoadMatrixOnly=True``, rebuilt as a square csr matrix and added.

    Parameters:
        pMatrixName: path prefix; matrices are addressed as
            ``pMatrixName + '::' + name``.
        pClusterMatricesList: names of the matrices belonging to the cluster.
        pClusterName: cluster identifier, used in the output name and logs.
        pQueue: queue receiving the output MatrixFileHandler, or None when
            the computation failed before the handler was created.
    """
    # Bound up-front so the final put() cannot raise UnboundLocalError when
    # the try body fails; without this nothing would ever reach the queue.
    matrixFileHandlerOutput = None
    consensus_matrix = None
    try:
        matrixFileHandlerInput = MatrixFileHandler(
            pFileType='cool',
            pMatrixFile=pMatrixName + '::' + pClusterMatricesList[0])
        _matrix, cut_intervals, nan_bins, \
            distance_counts, correction_factors = matrixFileHandlerInput.load()
        consensus_matrix = _matrix

        for matrix in pClusterMatricesList[1:]:

            matrixFileHandlerInput = MatrixFileHandler(
                pFileType='cool',
                pMatrixFile=pMatrixName + '::' + matrix,
                pLoadMatrixOnly=True)
            _matrix, _, _, _, _ = matrixFileHandlerInput.load()

            # Matrix-only loading yields (rows, columns, values, size).
            # Builtin float replaces np.float, an alias removed in NumPy 1.24.
            _matrix = csr_matrix((_matrix[2], (_matrix[0], _matrix[1])),
                                 (_matrix[3], _matrix[3]),
                                 dtype=float)

            if consensus_matrix is None:
                consensus_matrix = _matrix
            else:
                consensus_matrix += _matrix

        hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
        matrixFileHandlerOutput = MatrixFileHandler(
            pFileType='cool',
            pMatrixFile='consensus_matrix_cluster_' + str(pClusterName) + ':' +
            str(len(pClusterMatricesList)),
            pEnforceInteger=False,
            pFileWasH5=False,
            pHic2CoolVersion=hic2CoolVersion)

        matrixFileHandlerOutput.set_matrix_variables(consensus_matrix,
                                                     cut_intervals, nan_bins,
                                                     correction_factors,
                                                     distance_counts)
    except Exception as exp:
        log.debug('exception! {}'.format(str(exp)))
    log.debug('computaiton of {} done'.format(str(pClusterName)))
    pQueue.put(matrixFileHandlerOutput)
Example #11
0
def compute_sum(pMatrixName, pMatricesList, pThread, pQueue):
    """Put the list of total sums of all listed matrices on the queue.

    Each matrix is opened as ``pMatrixName + '::' + name`` with a 'cool'
    MatrixFileHandler. pThread is not used.
    """

    def _matrix_sum(cell_name):
        # Load a single matrix and return the sum over all its entries.
        handler = MatrixFileHandler(pFileType='cool',
                                    pMatrixFile=pMatrixName + '::' + cell_name)
        loaded_matrix, _, _, _, _ = handler.load()
        return loaded_matrix.sum()

    pQueue.put([_matrix_sum(cell_name) for cell_name in pMatricesList])
Example #12
0
def txt_to_matrixFileHandler(pMatricesList, pMatrixDimensions, pCutIntervals,
                             pQueue):
    """Read tab-separated text matrices and queue them as 'cool' handlers.

    Each file is expected to hold one entry per line whose first three
    tab-separated fields are row index, column index and count; blank
    lines are skipped. The entries are assembled into a square csr matrix
    of size pMatrixDimensions.

    Parameters:
        pMatricesList: paths of the text files; the third '_'-separated
            token of each path is stored as 'cell_type'.
        pMatrixDimensions: number of rows/columns of every matrix.
        pCutIntervals: cut intervals assigned to every output handler.
        pQueue: queue receiving the list of MatrixFileHandler objects.
    """
    matrixFileHandlerList = []

    for matrix in pMatricesList:

        # Collect the coordinate triplets for the csr matrix.
        instances = []
        features = []
        data = []
        with open(matrix, 'r') as file:
            # Iterate lazily instead of materialising the file via readlines().
            for line in file:
                line = line.strip()
                if not line:
                    continue
                x, y, count = line.split('\t')[:3]
                instances.append(int(x))
                features.append(int(y))
                data.append(float(count))

        cell_type = matrix.split('_')[2]

        log.debug('matrix name {}'.format(matrix))

        # default=None keeps this debug message from raising ValueError
        # when a file contains no entries.
        log.debug(
            'max(instances) {} max(features) {} pMatrixDimensions {}'.format(
                max(instances, default=None), max(features, default=None),
                pMatrixDimensions))
        # Builtin float replaces np.float, an alias removed in NumPy 1.24.
        hic_matrix = csr_matrix((data, (instances, features)),
                                (pMatrixDimensions, pMatrixDimensions),
                                dtype=float)

        matrixFileHandlerOutput = MatrixFileHandler(pFileType='cool',
                                                    pMatrixFile=matrix)

        matrixFileHandlerOutput.set_matrix_variables(hic_matrix, pCutIntervals,
                                                     None, None, None)

        # NOTE(review): 'cell_type' is only stored when no metadata dict
        # exists yet; confirm that existing metadata should stay untouched.
        if matrixFileHandlerOutput.matrixFile.hic_metadata is None:
            matrixFileHandlerOutput.matrixFile.hic_metadata = {}
            matrixFileHandlerOutput.matrixFile.hic_metadata[
                'cell_type'] = cell_type

        matrixFileHandlerList.append(matrixFileHandlerOutput)

    pQueue.put(matrixFileHandlerList)
Example #13
0
def load_cool_files(pMatrixName, pMatricesList, pCutIntervals, pQueue):
    """Load the listed matrices and queue new 'cool' handlers that hold their data.

    Every matrix is read from ``pMatrixName + "::" + name`` with
    ``pNoCutIntervals=True``; the output handler is named after the last
    '/'-separated part of the name and receives pCutIntervals. On any
    exception a string ``'Fail: <message><traceback>'`` is queued instead
    of the handler list.
    """
    handlers = []
    try:
        for cell_path in pMatricesList:
            source = MatrixFileHandler(pFileType='cool',
                                       pMatrixFile=pMatrixName + "::" + cell_path,
                                       pNoCutIntervals=True)
            (loaded_matrix, _, nan_bins,
             distance_counts, correction_factors) = source.load()

            target = MatrixFileHandler(pFileType='cool',
                                       pMatrixFile=cell_path.split('/')[-1])
            target.set_matrix_variables(loaded_matrix, pCutIntervals, nan_bins,
                                        correction_factors, distance_counts)
            handlers.append(target)
    except Exception as exp:
        pQueue.put('Fail: ' + str(exp) + traceback.format_exc())
    else:
        pQueue.put(handlers)
Example #14
0
def compute_consensus_matrix(pMatrixName, pClusterMatricesList, pAppend,
                             pQueue):
    """Sum the matrices of every cluster and queue one output handler per cluster.

    pClusterMatricesList is a list of clusters, each a list of matrix names
    addressed as ``pMatrixName + '::' + name``. The output handler of a
    cluster reuses the metadata returned by the last matrix loaded. The
    handler of the first cluster is created with ``pAppend=False`` when
    pAppend is truthy; all other handlers use ``pAppend=True``.
    """
    consensus_handlers = []
    for cluster_index, cluster in enumerate(pClusterMatricesList):
        append = not (cluster_index == 0 and pAppend)

        consensus_matrix = None
        for cell_name in cluster:
            matrixFileHandlerInput = MatrixFileHandler(
                pFileType='cool', pMatrixFile=pMatrixName + '::' + cell_name)
            (loaded_matrix, cut_intervals, nan_bins,
             distance_counts, correction_factors) = matrixFileHandlerInput.load()

            if consensus_matrix is not None:
                consensus_matrix += loaded_matrix
            else:
                consensus_matrix = loaded_matrix

        output_handler = MatrixFileHandler(
            pFileType='cool',
            pAppend=append,
            pEnforceInteger=False,
            pFileWasH5=False,
            pHic2CoolVersion=matrixFileHandlerInput.matrixFile.hic2cool_version)
        output_handler.set_matrix_variables(consensus_matrix, cut_intervals,
                                            nan_bins, correction_factors,
                                            distance_counts)
        consensus_handlers.append(output_handler)

    pQueue.put(consensus_handlers)
Example #15
0
def main(args=None):
    """Load all given cool matrices in parallel and save them as one scool file.

    The cut intervals of the first matrix are shared with every worker.
    The matrix list is split into ``args.threads`` chunks, the last chunk
    taking the remainder; each chunk is loaded by ``load_cool_files`` in
    its own process, and the queues are polled once per second.
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)

    reference_handler = MatrixFileHandler(pFileType='cool',
                                          pMatrixFile=args.matrices[0])
    _, cut_intervals_all, _, _, _ = reference_handler.load()

    matrices_list = args.matrices
    threads = args.threads

    results = [None] * threads
    workers = [None] * threads
    queues = [None] * threads
    finished = [False] * threads
    chunk_size = len(matrices_list) // threads

    for i in range(threads):
        first = i * chunk_size
        # The last worker takes everything that is left over.
        last = (i + 1) * chunk_size if i < threads - 1 else None
        chunk = matrices_list[first:last]

        queues[i] = Queue()
        workers[i] = Process(target=load_cool_files,
                             kwargs=dict(pMatricesList=chunk,
                                         pCutIntervals=cut_intervals_all,
                                         pQueue=queues[i]))
        workers[i].start()

    all_data_collected = False
    while not all_data_collected:
        for i in range(threads):
            if queues[i] is None or queues[i].empty():
                continue
            results[i] = queues[i].get()
            queues[i] = None
            workers[i].join()
            workers[i].terminate()
            workers[i] = None
            finished[i] = True
        all_data_collected = all(finished)
        time.sleep(1)

    # Flatten the per-worker lists into one list of handlers.
    collected_handlers = []
    for worker_result in results:
        collected_handlers.extend(worker_result)

    scool_handler = MatrixFileHandler(pFileType='scool')
    scool_handler.matrixFile.coolObjectsList = collected_handlers
    scool_handler.save(args.outFileName,
                       pSymmetric=True,
                       pApplyCorrection=False)
Example #16
0
def load_cool_files(pMatricesList, pCutIntervals, pQueue):
    """Load each cool file and queue handlers holding the data with shared cut intervals.

    Files are read with ``pNoCutIntervals=True`` and pCutIntervals is
    assigned to every output handler. A file that raises while being
    processed is logged as a warning and left out of the queued list.
    """
    handlers = []
    for cool_path in pMatricesList:
        try:
            source = MatrixFileHandler(pFileType='cool',
                                       pMatrixFile=cool_path,
                                       pNoCutIntervals=True)
            (loaded_matrix, _, nan_bins,
             distance_counts, correction_factors) = source.load()

            target = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_path)
            target.set_matrix_variables(loaded_matrix, pCutIntervals, nan_bins,
                                        correction_factors, distance_counts)
            handlers.append(target)
        except Exception as exp:
            message = 'File could not be opend and is excluded: {}. Error message: {} '
            log.warning(message.format(cool_path, str(exp)))

    pQueue.put(handlers)
Example #17
0
def compute_correction(pMatrixName, pMatrixList, pCutIntervals, pQueue):
    """Compute KR correction factors for each matrix and queue the output handlers.

    For every name in pMatrixList the pixel table of
    ``pMatrixName + '::' + name`` is loaded, turned into a csr matrix and
    balanced with ``kr_balancing``. Pixel tables lacking one of the columns
    'bin1_id', 'bin2_id' or 'count' are skipped.

    Parameters:
        pMatrixName: path prefix of the container holding the matrices.
        pMatrixList: names of the matrices to correct.
        pCutIntervals: cut intervals assigned to every output handler.
        pQueue: queue receiving the list of MatrixFileHandler objects, or
            the exception message as a string when processing failed.
    """
    out_queue_list = []

    print('len(pMatrixList): ' + str(len(pMatrixList)))
    try:
        for i, matrix in enumerate(pMatrixList):

            pixels, shape, _ = load_matrix(pMatrixName + '::' + matrix, None, False, None)

            required_columns = ('bin1_id', 'bin2_id', 'count')
            if not all(column in pixels.columns for column in required_columns):
                continue

            instances = pixels['bin1_id'].values
            features = pixels['bin2_id'].values
            data = pixels['count'].values

            # A separate name keeps the loop variable `matrix` (the matrix
            # name) intact; it was previously overwritten by the sparse
            # matrix, which then ended up as pMatrixFile of the output.
            # Builtin float replaces np.float, an alias removed in NumPy 1.24.
            sparse_matrix = csr_matrix((data, (instances, features)),
                                       (shape[0], shape[1]), dtype=float)

            kr = kr_balancing(shape[0], shape[1],
                              sparse_matrix.count_nonzero(),
                              sparse_matrix.indptr.astype(np.int64, copy=False),
                              sparse_matrix.indices.astype(np.int64, copy=False),
                              sparse_matrix.data.astype(np.float64, copy=False))
            kr.computeKR()
            correction_factors = kr.get_normalisation_vector(False).todense()

            matrixFileHandlerOutput = MatrixFileHandler(pFileType='cool', pMatrixFile=matrix)

            matrixFileHandlerOutput.set_matrix_variables(sparse_matrix,
                                                         pCutIntervals,
                                                         None,
                                                         correction_factors,
                                                         None)

            out_queue_list.append(matrixFileHandlerOutput)
            print('DOne i: ' + str(i))
    except Exception as exp:
        print('Exception: ' + str(exp))
        log.debug('Exception! {}'.format(str(exp)))
        pQueue.put(str(exp))
        return

    pQueue.put(out_queue_list)
    return
Example #18
0
def test_hicConvertFormat_2D_text_to_cool():
    """Convert a 2D-text file to cool and compare it with the stored reference matrix."""
    outfile = NamedTemporaryFile(suffix='.cool', delete=False)
    outfile.close()

    text_2d = ROOT + '/GSM1436265_RAD21_ENCFF002EMQ.txt'
    chromosome_sizes = ROOT + '/hg19.chrom.sizes'
    command = "--matrices {} --outFileName {} --inputFormat 2D-text --outputFormat cool -r 10000 --chromosomeSizes {}"
    args = command.format(text_2d, outfile.name, chromosome_sizes).split()
    compute(hicConvertFormat.main, args, 5)

    converted = hm.hiCMatrix(outfile.name)

    reference_handler = MatrixFileHandler(
        pFileType='cool', pMatrixFile=ROOT + '/2dtexttocool.cool')
    reference_matrix, _, _, _, _ = reference_handler.load()

    # Compare only the upper triangle of the converted matrix.
    converted.matrix = triu(converted.matrix)
    nt.assert_array_almost_equal(converted.matrix.data,
                                 reference_matrix.data,
                                 decimal=0)
Example #19
0
def test_load_cool2(capsys):
    """Load a cool file holding a single interaction and verify matrix and bins.

    The interaction is chr1 10000 / chr1 200000; with 50 kb bins this is
    one count between bin 0 and bin 3.
    """
    bin_size = 50000
    cool_path = ROOT + 'one_interaction_4chr.cool'
    handler = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_path)
    assert handler is not None

    loaded = handler.load()
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = loaded

    # Exactly one stored value.
    nt.assert_almost_equal(matrix.data, np.array([1]))

    # Row 3 is empty, row 0 carries the single count in column 3.
    expected_row = np.zeros((1, 9167), dtype=int)
    nt.assert_almost_equal(matrix[3].todense(), expected_row)
    expected_row[0][3] = 1
    nt.assert_almost_equal(matrix[0].todense(), expected_row)

    # Full-size bins per chromosome followed by one shorter closing bin.
    chromosome_layout = [('chr1', 3909, 195471971),
                         ('chrX', 3420, 171031299),
                         ('chrY', 1834, 91744698)]
    expected_intervals = []
    for chromosome, full_bins, chromosome_end in chromosome_layout:
        expected_intervals.extend(
            (chromosome, k * bin_size, (k + 1) * bin_size, 1.0)
            for k in range(full_bins))
        expected_intervals.append(
            (chromosome, full_bins * bin_size, chromosome_end, 1.0))
    expected_intervals.append(('chrM', 0, 16299, 1.0))

    for position, observed in enumerate(cut_intervals):
        expected = expected_intervals[position]
        for field, value in enumerate(observed):
            assert value == expected[field]

    expected_nan_bins = [0, 1, 2, 4]
    nt.assert_almost_equal(nan_bins[:4], expected_nan_bins)

    assert distance_counts is None
    assert correction_factors is None
Example #20
0
def test_hicConvertFormat_hicpro_to_cool():
    """Convert the hicpro test matrix to cool and compare it with the hicpro source."""
    outfile = NamedTemporaryFile(suffix='.cool', delete=False)
    outfile.close()

    hicprofile = ROOT + '/test_matrix.hicpro'
    bedfile = ROOT + '/test_matrix.bed'
    command = "--matrices {} --outFileName {} --inputFormat hicpro --outputFormat cool --bedFileHicpro {}"
    args = command.format(hicprofile, outfile.name, bedfile).split()
    compute(hicConvertFormat.main, args, 5)

    converted = hm.hiCMatrix(outfile.name)

    source_handler = MatrixFileHandler(pFileType='hicpro',
                                       pMatrixFile=hicprofile,
                                       pBedFileHicPro=bedfile)
    source_matrix, _, _, _, _ = source_handler.load()

    # Compare only the upper triangle of the converted matrix.
    converted.matrix = triu(converted.matrix)
    nt.assert_array_almost_equal(converted.matrix.data,
                                 source_matrix.data,
                                 decimal=0)
def test_load_cool_matrix_only():
    """Compare a matrix-only load of a cool file with a regular load of the same file.

    With ``pLoadMatrixOnly=True`` the matrix is a 4-element object
    (rows, columns, values, size) and all other return values are None.
    """
    cool_path = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'

    matrix_only_handler = MatrixFileHandler(pFileType='cool',
                                            pMatrixFile=cool_path,
                                            pLoadMatrixOnly=True)
    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = matrix_only_handler.load()

    assert len(matrix) == 4
    assert cut_intervals is None
    assert nan_bins is None
    assert distance_counts is None
    assert correction_factors is None

    full_handler = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_path)
    full_matrix, _, _, _, _ = full_handler.load()

    rows, columns = full_matrix.nonzero()
    reference_parts = (rows, columns, full_matrix.data)
    for part_index, reference in enumerate(reference_parts):
        nt.assert_almost_equal(matrix[part_index], reference, decimal=1)
    assert matrix[3] == full_matrix.shape[0]
Example #22
0
def test_load_homer(capsys):
    """Load the homer test matrix and verify its first row and cut intervals."""
    homer_path = ROOT + 'test_matrix.homer'
    handler = MatrixFileHandler(pFileType='homer', pMatrixFile=homer_path)
    assert handler is not None

    loaded = handler.load()
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = loaded

    expected_row = np.array([[
        1.0, 0.1896, 0.2163, 0.08288, 0.1431, 0.2569, 0.1315,
        0.1488, -0.0312, 0.143, 0.06091, 0.03546, 0.1168,
    ]])
    nt.assert_almost_equal(matrix[0].todense(), expected_row)

    # 13 consecutive 20 kb bins on 3R, starting at position 1,000,000.
    first_start = 1000000
    bin_width = 20000
    expected_intervals = [
        ('3R', first_start + k * bin_width, first_start + (k + 1) * bin_width, 1)
        for k in range(13)
    ]
    nt.assert_equal(cut_intervals, expected_intervals)

    assert nan_bins is None
    assert distance_counts is None
    assert correction_factors is None
Example #23
0
def test_save_homer():
    """Load the homer test matrix, save it again and remove the written file."""
    target_path = outfile + '.homer'

    source_path = ROOT + 'test_matrix.homer'
    handler = MatrixFileHandler(pFileType='homer', pMatrixFile=source_path)
    assert handler is not None

    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = handler.load()

    # Hand the loaded data back to the handler before saving.
    handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                 correction_factors, distance_counts)
    handler.save(pName=target_path, pSymmetric=False, pApplyCorrection=False)

    os.unlink(target_path)
Example #24
0
def test_save_h5():
    """Load the h5 test matrix, save it again and remove the written file."""
    target_path = outfile + '.h5'

    source_path = ROOT + 'Li_et_al_2015.h5'
    handler = MatrixFileHandler(pFileType='h5', pMatrixFile=source_path)
    assert handler is not None

    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = handler.load()

    # Hand the loaded data back to the handler before saving.
    handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                 correction_factors, distance_counts)
    handler.save(target_path, True, None)

    os.unlink(target_path)
Example #25
0
def compute_normalize(pMatrixName, pMatricesList, pArgminSum, pSumOfAll,
                      pAppend, pQueue):
    """Scale every matrix by ``pSumOfAll[i] / pArgminSum`` and queue the handlers.

    Matrix values are converted to float32; NaN and infinite values are
    set to zero before and after the division, and explicit zeros are
    removed afterwards. The first handler is created with
    ``pAppend=False`` when pAppend is truthy, all others with
    ``pAppend=True``.
    """

    def _zero_invalid(values):
        # Replace NaN and +/-inf entries by 0, modifying the array in place.
        values[np.isnan(values)] = 0
        values[np.isinf(values)] = 0

    normalized_handlers = []
    for index, cell_name in enumerate(pMatricesList):
        append = not (index == 0 and pAppend)

        source = MatrixFileHandler(pFileType='cool',
                                   pMatrixFile=pMatrixName + '::' + cell_name)
        (cell_matrix, cut_intervals, nan_bins,
         distance_counts, correction_factors) = source.load()

        cell_matrix.data = cell_matrix.data.astype(np.float32)
        _zero_invalid(cell_matrix.data)

        adjust_factor = pSumOfAll[index] / pArgminSum
        cell_matrix.data /= adjust_factor

        _zero_invalid(cell_matrix.data)
        cell_matrix.eliminate_zeros()

        target = MatrixFileHandler(pFileType='cool',
                                   pAppend=append,
                                   pEnforceInteger=False,
                                   pFileWasH5=False,
                                   pHic2CoolVersion=None)
        target.set_matrix_variables(cell_matrix, cut_intervals, nan_bins,
                                    correction_factors, distance_counts)
        normalized_handlers.append(target)

    pQueue.put(normalized_handlers)
Example #26
0
def _nan_inf_to_zero_dense(pMatrix):
    """Pass pMatrix through convertNansToZeros and convertInfsToZeros and return the dense result."""
    cleaned = convertNansToZeros(csr_matrix(pMatrix)).todense()
    return convertInfsToZeros(csr_matrix(cleaned)).todense()


def main(args=None):
    """Compute eigenvectors per chromosome of a Hi-C matrix and write them out.

    For every chromosome the observed/expected matrix is computed (with
    ``exp_obs_matrix_norm`` if ``args.norm`` is set, otherwise with
    ``exp_obs_matrix_lieberman``), turned into a Pearson correlation matrix
    and then into a covariance matrix whose eigenvectors are collected.
    The first ``args.numberOfEigenvectors`` eigenvectors are written, one per
    file in ``args.outputFileName``, as bedgraph or bigwig.

    Optionally the genome-wide Pearson and observed/expected matrices are
    saved to ``args.pearsonMatrix`` / ``args.obsexpMatrix`` (h5 if the name
    ends with '.h5', cool otherwise).

    Arguments:
        args: list of command line arguments handed to the argument parser;
            ``None`` lets the parser pick its default source.

    The process exits with status 1 if the number of output files does not
    match the number of eigenvectors, if pyBigWig lacks numpy support for
    bigwig output, or if the output format is unknown.
    """
    args = parse_arguments().parse_args(args)
    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        # The two literals are concatenated, so the first needs a trailing space.
        log.error(
            "Number of output file names and number of eigenvectors does not match. Please "
            "provide the name of each file.\nFiles: {}\nNumber of eigenvectors: {}"
            .format(args.outputFileName, args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []
    # PCA is computed per chromosome
    length_chromosome = 0
    chromosome_count = len(ma.getChrNames())
    if args.pearsonMatrix:
        trasf_matrix_pearson = lil_matrix(ma.matrix.shape)

    if args.obsexpMatrix:
        trasf_matrix_obsexp = lil_matrix(ma.matrix.shape)

    # Total number of bins over all chromosomes; passed to the
    # observed/expected computation of every chromosome below.
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]

    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)

        submatrix = ma.matrix[chr_range[0]:chr_range[1],
                              chr_range[0]:chr_range[1]]
        if args.norm:
            exp_obs_matrix_ = exp_obs_matrix_norm(submatrix, length_chromosome,
                                                  chromosome_count)
        else:
            exp_obs_matrix_ = exp_obs_matrix_lieberman(submatrix,
                                                       length_chromosome,
                                                       chromosome_count)
        exp_obs_matrix_ = _nan_inf_to_zero_dense(exp_obs_matrix_)

        if args.obsexpMatrix:
            trasf_matrix_obsexp[chr_range[0]:chr_range[1],
                                chr_range[0]:chr_range[1]] = lil_matrix(
                                    exp_obs_matrix_)

        pearson_correlation_matrix = _nan_inf_to_zero_dense(
            np.corrcoef(exp_obs_matrix_))

        if args.pearsonMatrix:
            trasf_matrix_pearson[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = lil_matrix(
                                     pearson_correlation_matrix)

        corrmatrix = _nan_inf_to_zero_dense(np.cov(pearson_correlation_matrix))
        evals, eigs = linalg.eig(corrmatrix)
        k = args.numberOfEigenvectors

        chrom, start, end, _ = zip(
            *ma.cut_intervals[chr_range[0]:chr_range[1]])
        # One row per bin, holding the first k eigenvector components.
        vecs_list += eigs[:, :k].tolist()

        chrom_list += chrom
        start_list += start
        end_list += end

    if args.pearsonMatrix:
        file_type = 'cool'
        if args.pearsonMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_pearson.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.pearsonMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.obsexpMatrix:
        file_type = 'cool'
        if args.obsexpMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_obsexp.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.obsexpMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.geneTrack:
        vecs_list = correlateEigenvectorWithGeneTrack(ma, vecs_list,
                                                      args.geneTrack)

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert (len(vecs_list) == len(chrom_list))

            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    if len(value) == args.numberOfEigenvectors:
                        # linalg.eig may return complex values; only the real
                        # part is written. The builtin `complex` is used
                        # because the `np.complex` alias was removed from NumPy.
                        if isinstance(value[idx], complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(
                            toString(chrom_list[i]), start_list[i],
                            end_list[i], value[idx]))

    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error(
                "ERROR: Your version of pyBigWig is not supporting numpy: {}".
                format(pyBigWig.__file__))
            exit(1)
        # Build the bigwig header: one (chromosome, last end position) entry
        # per chromosome, in the order the chromosomes appear.
        old_chrom = chrom_list[0]
        header = []
        for i, _chrom in enumerate(chrom_list):
            if old_chrom != _chrom:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = _chrom

        header.append((toString(chrom_list[-1]), end_list[-1]))
        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))

            assert (len(vecs_list) == len(chrom_list))
            _chrom_list = []
            _start_list = []
            _end_list = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            # set big wig header
            bw.addHeader(header)
            # create entry lists
            for i, value in enumerate(vecs_list):
                # it can happen that some 'value' is having less dimensions than it should
                if len(value) == args.numberOfEigenvectors:
                    # See the bedgraph branch: builtin `complex` replaces the
                    # removed `np.complex` alias.
                    if isinstance(value[idx], complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    _chrom_list.append(toString(chrom_list[i]))
                    _start_list.append(start_list[i])
                    _end_list.append(end_list[i])

            # write entries
            bw.addEntries(_chrom_list,
                          _start_list,
                          ends=_end_list,
                          values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)
Example #27
0
def test_save_cool_enforce_integer():
    """Round-trip an h5 matrix through a cool file saved with pEnforceInteger=True.

    The h5 reference matrix is loaded, written as cool (with the handler
    flagged as coming from h5 and the correction applied on save) and read
    back without applying the correction on load. The values must agree
    with the reference when rounded to whole numbers, and the bin
    bookkeeping must be unchanged.
    """
    cool_outfile = outfile + '.cool'
    h5_path = ROOT + 'Li_et_al_2015.h5'

    # Load the reference data from the h5 file.
    fh = MatrixFileHandler(pFileType='h5', pMatrixFile=h5_path)
    assert fh is not None
    (matrix, cut_intervals, nan_bins,
     distance_counts, correction_factors) = fh.load()

    # Write it as cool, enforcing integer values.
    fh_new = MatrixFileHandler(pFileType='cool', pEnforceInteger=True)
    fh_new.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                correction_factors, distance_counts)
    fh_new.matrixFile.fileWasH5 = True
    fh_new.save(pName=cool_outfile, pSymmetric=False, pApplyCorrection=True)

    # Read the freshly written cool file back without applying corrections.
    fh_test = MatrixFileHandler(pFileType='cool',
                                pMatrixFile=cool_outfile,
                                pApplyCorrectionCoolerLoad=False)
    assert fh_test is not None
    (matrix_test, cut_intervals_test, nan_bins_test,
     distance_counts_test, correction_factors_test) = fh_test.load()

    # The h5 file must still be openable after the conversion.
    fh = MatrixFileHandler(pFileType='h5', pMatrixFile=h5_path)
    assert fh is not None

    # Values are compared to zero decimals only.
    nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=0)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)

    os.unlink(cool_outfile)
Example #28
0
def main(args=None):
    """Convert Hi-C matrices between file formats.

    Supported flows, as implemented below:

    * ``hic`` input is converted with ``hic2cool_convert`` and can only be
      written as ``cool``. With ``--resolutions`` one file per resolution is
      written and ``_<resolution>`` is inserted in front of the file
      extension of the output name.
    * ``hicpro``, ``homer``, ``h5``, ``cool`` and ``2D-text`` input is loaded
      into memory and written as ``cool``, ``h5``, ``homer``,
      ``ginteractions``, ``hicpro`` or ``mcool``.

    Arguments:
        args: list of command line arguments handed to the argument parser;
            ``None`` lets the parser pick its default source.

    The process exits with status 1 on inconsistent arguments (number of
    input/output/bed files, missing resolution or chromosome sizes for
    2D-text input, hic input with a non-cool output format).
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!: Input matrices {}; output matrices {}'
                .format(len(args.matrices), len(args.outFileName)))
            sys.exit(1)
    if args.inputFormat == 'hic' and args.outputFormat != 'cool':
        log.error('The export of a hic file is only possible to a cool file.')
        sys.exit(1)
    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:

                for resolution in args.resolutions:
                    # '<name>.<ext>' becomes '<name>_<resolution>.<ext>'
                    out_name = args.outFileName[i].split('.')
                    out_name[-2] = out_name[-2] + '_' + str(resolution)
                    out_name = '.'.join(out_name)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool', '2D-text']:
        format_was_h5 = False
        if args.inputFormat == 'h5':
            format_was_h5 = True
        applyCorrection = True
        if args.store_applied_correction:
            applyCorrection = False
        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        if args.inputFormat == '2D-text':
            if args.resolutions is None:
                log.error('The resolution must be defined via --resolutions')
                sys.exit(1)
            if args.chromosomeSizes is None:
                log.error(
                    'The sizes of the chromosomes must be defined via --chromosomeSizes.'
                )
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = matrixFileHandlerInput.load()
            elif args.inputFormat == '2D-text':
                # Read '<chromosome>\t<size>' lines; reading stops at the
                # first empty line or at the end of the file.
                chrom_sizes = OrderedDict()
                size_genome = 0
                with open(args.chromosomeSizes.name, 'r') as sizes_file:
                    for raw_line in sizes_file:
                        stripped = raw_line.strip()
                        if stripped == '':
                            break
                        line_split = stripped.split('\t')
                        chrom_sizes[line_split[0]] = int(line_split[1])
                        size_genome += int(line_split[1])
                chrom_sizes = list(chrom_sizes.items())

                # log.debug('chrom_sizes: {}'.format(chrom_sizes))
                args.resolutions = [int(x) for x in args.resolutions]
                # internal_matrix_size = size_genome // args.resolutions[0]

                # Tile every chromosome with bins of the first resolution;
                # the last bin of a chromosome is clipped to its size.
                cut_intervals = []
                for chromosome in chrom_sizes:
                    for interval in range(0, chromosome[1],
                                          args.resolutions[0]):
                        cut_intervals.append(
                            (chromosome[0], interval,
                             min(chromosome[1],
                                 interval + args.resolutions[0]), 1.0))

                hic_matrix_csr = lil_matrix(
                    (len(cut_intervals), len(cut_intervals)))
                log.debug('cut_intervals {}'.format(cut_intervals[:20]))

                hic_matrix = HiCMatrix.hiCMatrix()
                hic_matrix.setMatrix(hic_matrix_csr, cut_intervals)
                # tmp_matrix = coo_matrix(())
                # Each line: chrom1, start1, end1, chrom2, start2, end2, value
                with open(matrix, 'r') as text_file:
                    for j, line in enumerate(text_file):
                        line_split = line.split('\t')
                        chromosome_1 = str(line_split[0])
                        start_1 = int(line_split[1])
                        end_1 = int(line_split[2])

                        chromosome_2 = str(line_split[3])
                        start_2 = int(line_split[4])
                        end_2 = int(line_split[5])

                        value = float(line_split[6])
                        bin_id_1 = hic_matrix.getRegionBinRange(
                            chromosome_1, start_1, end_1)
                        bin_id_2 = hic_matrix.getRegionBinRange(
                            chromosome_2, start_2, end_2)
                        # Best effort: entries that cannot be assigned are
                        # only reported on debug level and skipped.
                        try:
                            hic_matrix.matrix[bin_id_1, bin_id_2] = value
                        except Exception as exp:
                            log.debug(str(exp))
                        if j % 1000 == 0:
                            log.debug('{} lines computed'.format(j))
                log.debug('csr with values filled!')
                hic_matrix.matrix = hic_matrix.matrix.tocsr()

                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = hic_matrix.matrix, hic_matrix.cut_intervals, hic_matrix.nan_bins, \
                    hic_matrix.distance_counts, hic_matrix.correction_factors

            else:
                correction_operator = None

                if args.correction_division:
                    correction_operator = '/'

                chromosomes_to_load = None
                if args.chromosome:
                    chromosomes_to_load = [args.chromosome]
                applyCorrectionCoolerLoad = True
                if args.load_raw_values:
                    applyCorrectionCoolerLoad = False
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)

                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('cut_intervals {}'.format(cut_intervals[:20]))

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                log.debug('cool h5 homer ginteractions hicpro branch')

                if args.outputFormat in ['homer', 'ginteractions']:
                    log.debug('homer ginteractions branch')

                    # make it a upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)
                hic2CoolVersion = None
                cool_metadata = None
                if args.inputFormat == 'cool':
                    hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                    cool_metadata = matrixFileHandlerInput.matrixFile.hic_metadata

                log.debug('cool_metadata {}'.format(cool_metadata))
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5,
                    pHic2CoolVersion=hic2CoolVersion,
                    pHiCInfo=cool_metadata)

                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                log.debug('len(args.outFileName) {}, i {}'.format(
                    len(args.outFileName), i))
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)

            if args.outputFormat == 'hicpro':
                log.debug('hicpro branch')
                if len(args.matrices) == len(args.outFileName) and len(
                        args.outFileName) == len(args.bedFileHicpro):
                    log.debug('args.bedFileHicpro[i] {}'.format(
                        args.bedFileHicpro[i]))
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType=args.outputFormat,
                        pBedFileHicPro=args.bedFileHicpro[i])

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[i],
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
                else:
                    # The input matrices live in args.matrices (as used in
                    # the condition above), not args.matrix.
                    log.error(
                        'The number of input matrices, output files and bed files does not match: Input: {}; Output: {}; Bed: {}'
                        .format(len(args.matrices), len(args.outFileName),
                                len(args.bedFileHicpro)))
                    sys.exit(1)
            elif args.outputFormat in ['mcool']:

                log.debug('outformat is mcool')
                if args.resolutions and len(args.matrices) > 1:
                    log.error(
                        'Please define one matrix and many resolutions which should be created or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)

                    bin_size = hic_matrix.getBinSize()
                    hic2CoolVersion = None
                    cool_metadata = None
                    if args.inputFormat == 'cool':
                        hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                        cool_metadata = matrixFileHandlerInput.matrixFile.hic_metadata
                    for j, resolution in enumerate(args.resolutions):
                        hic_matrix_res = deepcopy(hic_matrix)

                        _mergeFactor = int(resolution) // bin_size

                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            merged_matrix = hic_matrix_res
                        # The first resolution creates the file, all further
                        # resolutions are appended to it.
                        append = False
                        if j > 0:
                            append = True
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=append,
                            pFileWasH5=format_was_h5,
                            pHic2CoolVersion=hic2CoolVersion,
                            pHiCInfo=cool_metadata)

                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' +
                            str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)

                else:
                    # Without --resolutions every input matrix is stored
                    # under its own bin size in the same mcool file.
                    append = False
                    if i > 0:
                        append = True
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        pAppend=append,
                        pFileWasH5=format_was_h5)

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
def main(args=None):
    """Merge the bins of all matrices in a multi-cooler file in parallel.

    The matrices listed by ``cooler.fileops.list_coolers(args.matrix)`` are
    split into contiguous chunks, one per worker process. Every worker runs
    ``compute_merge`` and returns its results through a queue. The merged
    matrices are then written to ``args.outFileName`` under the same
    internal names as in the input file.

    Arguments:
        args: list of command line arguments handed to the argument parser;
            ``None`` lets the parser pick its default source.
    """
    args = parse_arguments().parse_args(args)

    matrices_list = cooler.fileops.list_coolers(args.matrix)

    # Never start more workers than there are matrices. All per-worker
    # bookkeeping must be sized AFTER this clamp: result slots of workers
    # that are never started would stay None and break the flattening below.
    threads = args.threads
    if len(matrices_list) < threads:
        threads = len(matrices_list)

    merged_matrices = [None] * threads
    thread_done = [False] * threads
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):

        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) *
                                               matricesPerThread]
        else:
            # The last worker also takes the remainder of the division.
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_merge,
                             kwargs=dict(pMatrixName=args.matrix,
                                         pMatrixList=matrices_name_list,
                                         pRunningWindow=args.runningWindow,
                                         pNumBins=args.numBins,
                                         pQueue=queue[i]))

        process[i].start()

    all_data_collected = False
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                log.debug('i {}'.format(i))
                log.debug('len(queue) {}'.format(len(queue)))
                log.debug('len(merged_matrices) {}'.format(
                    len(merged_matrices)))

                # Fetch the result before joining the worker.
                merged_matrices[i] = queue[i].get()

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = all(thread_done)
        # Wait once per polling round, and only if something is still missing.
        if not all_data_collected:
            time.sleep(1)

    # Chunks are contiguous, so the flattened results are in the same order
    # as matrices_list.
    merged_matrices = [item for sublist in merged_matrices for item in sublist]

    for i, hic_matrix in enumerate(merged_matrices):
        # The first matrix creates the output file, the others are appended.
        append = False
        if i > 0:
            append = True
        matrixFileHandlerOutput = MatrixFileHandler(pFileType='cool',
                                                    pAppend=append,
                                                    pFileWasH5=False)

        matrixFileHandlerOutput.set_matrix_variables(
            hic_matrix.matrix, hic_matrix.cut_intervals, hic_matrix.nan_bins,
            hic_matrix.correction_factors, hic_matrix.distance_counts)
        matrixFileHandlerOutput.save(args.outFileName + '::' +
                                     matrices_list[i],
                                     pSymmetric=True,
                                     pApplyCorrection=False)
Example #30
0
def main(args=None):
    """Convert Hi-C interaction matrices between file formats.

    The command-line arguments are parsed with ``parse_arguments()`` and
    then one of two paths is taken:

    * ``hic`` input with ``cool`` output: every input file is handed to
      ``hic2cool_convert``; if ``--resolutions`` is given, one output file
      per resolution is written, with ``_<resolution>`` inserted before the
      file extension.
    * ``hicpro``, ``homer``, ``h5`` or ``cool`` input: every matrix is loaded
      through ``MatrixFileHandler`` and written either as ``cool``, ``h5``,
      ``homer`` or ``ginteractions`` (one output file per input), or as a
      single ``mcool`` file (``args.outFileName[0]``) with one
      ``::/resolutions/<n>`` entry per resolution or per input matrix.

    Any other combination of input and output format falls through without
    doing anything.

    :param args: list of command-line arguments; ``None`` is forwarded to
                 ``parse_args`` unchanged.
    :return: None. The process is terminated with exit status 1 if the
             number of matrices does not match the number of output files
             (or of HiC-Pro bed files).
    """
    # Function-scope import keeps this block self-contained; it is only
    # needed to build the per-resolution output names below.
    import os

    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!'
            )
            # sys.exit instead of the bare exit(): the latter is injected by
            # the site module and is not guaranteed to exist.
            sys.exit(1)
    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                # NOTE(review): resolution 0 presumably asks hic2cool for all
                # resolutions -- confirm against the hic2cool documentation.
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:
                # Insert the resolution in front of the extension.  splitext
                # also copes with names without an extension and with dots in
                # directory names, where splitting on '.' failed.
                stem, extension = os.path.splitext(args.outFileName[i])
                for resolution in args.resolutions:
                    out_name = '{}_{}{}'.format(stem, resolution, extension)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']:
        format_was_h5 = args.inputFormat == 'h5'
        # Corrections are applied on save unless the user asked to store the
        # already applied correction.
        applyCorrection = not args.store_applied_correction
        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                # HiC-Pro matrices come with one bed file per matrix.
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
            else:
                correction_operator = '/' if args.correction_division else None
                chromosomes_to_load = [args.chromosome] if args.chromosome else None
                applyCorrectionCoolerLoad = not args.load_raw_values
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)

            _matrix, cut_intervals, nan_bins, \
                distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                if args.outputFormat in ['homer', 'ginteractions']:
                    # make it a upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)
                # Only cool input carries a hic2cool version to pass through.
                hic2CoolVersion = None
                if args.inputFormat == 'cool':
                    hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5,
                    pHic2CoolVersion=hic2CoolVersion)

                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)
            elif args.outputFormat in ['mcool']:

                log.debug('outformat is mcool')
                # NOTE(review): this error is only logged; the conversion
                # continues and every matrix writes its first resolution with
                # append=False into args.outFileName[0], presumably replacing
                # the previous content -- consider aborting here.
                if args.resolutions and len(args.matrices) > 1:
                    log.error(
                        'Please define one matrix and many resolutions which should be created or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)

                    bin_size = hic_matrix.getBinSize()

                    for j, resolution in enumerate(args.resolutions):
                        # Work on a copy so each resolution starts from the
                        # matrix as it was loaded.
                        hic_matrix_res = deepcopy(hic_matrix)

                        _mergeFactor = int(resolution) // bin_size

                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            merged_matrix = hic_matrix_res
                        # The first resolution starts the file, later ones
                        # are appended to it.
                        append = j > 0
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=append,
                            pFileWasH5=format_was_h5)

                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' +
                            str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)

                else:
                    # One entry per input matrix, stored under its own bin
                    # size; the first matrix starts the file.
                    append = i > 0
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        pAppend=append,
                        pFileWasH5=format_was_h5)

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)