コード例 #1
0
def compute_consensus_matrix(pMatrixName, pClusterMatricesList, pAppend,
                             pQueue):
    """Sum the matrices of each cluster and queue one handler per cluster.

    Every entry of ``pClusterMatricesList`` is an iterable of matrix names.
    Each name is loaded from ``pMatrixName`` (addressed as
    ``pMatrixName + '::' + name``) and the loaded matrices of one cluster
    are added up. The sum is stored in a fresh 'cool' ``MatrixFileHandler``
    together with the bin information returned by the last load.

    The handler of the first cluster is created with ``pAppend=False`` only
    if ``pAppend`` is truthy; all other handlers use ``pAppend=True``.

    The list of handlers is put on ``pQueue``; nothing is returned.
    """
    handlers = []
    for cluster_index, cluster in enumerate(pClusterMatricesList):
        summed_matrix = None
        append = not (cluster_index == 0 and pAppend)

        for matrix in cluster:
            matrixFileHandlerInput = MatrixFileHandler(
                pFileType='cool', pMatrixFile=pMatrixName + '::' + matrix)
            (_matrix, cut_intervals, nan_bins, distance_counts,
             correction_factors) = matrixFileHandlerInput.load()

            if summed_matrix is not None:
                summed_matrix += _matrix
            else:
                summed_matrix = _matrix

        # The hic2cool version is taken from the most recently loaded input.
        output_handler = MatrixFileHandler(
            pFileType='cool',
            pAppend=append,
            pEnforceInteger=False,
            pFileWasH5=False,
            pHic2CoolVersion=matrixFileHandlerInput.matrixFile.hic2cool_version)
        output_handler.set_matrix_variables(summed_matrix, cut_intervals,
                                            nan_bins, correction_factors,
                                            distance_counts)
        handlers.append(output_handler)

    pQueue.put(handlers)
コード例 #2
0
def compute_merge(pMatrixName, pMatrixList, pRunningWindow, pNumBins, pQueue):
    """Merge bins of every listed matrix and queue the resulting handlers.

    Each name in ``pMatrixList`` is opened as ``pMatrixName + '::' + name``.
    Depending on ``pRunningWindow`` the bins are merged with
    ``running_window_merge`` or ``merge_bins`` using ``pNumBins``. The
    merged matrix is wrapped in a 'cool' ``MatrixFileHandler`` named after
    the input matrix.

    On success the list of handlers is put on ``pQueue``. If any exception
    is raised, a one-element list holding a ``"Fail: ..."`` message is
    queued instead.
    """
    handlers = []
    try:
        for matrix in pMatrixList:
            hic = hm.hiCMatrix(pMatrixName + '::' + matrix)

            merged = (running_window_merge(hic, pNumBins)
                      if pRunningWindow else merge_bins(hic, pNumBins))

            handler = MatrixFileHandler(pFileType='cool',
                                        pMatrixFile=matrix,
                                        pEnforceInteger=False,
                                        pFileWasH5=False)
            handler.set_matrix_variables(merged.matrix,
                                         merged.cut_intervals,
                                         merged.nan_bins,
                                         merged.correction_factors,
                                         merged.distance_counts)
            handlers.append(handler)

        pQueue.put(handlers)
    except Exception as exp:
        pQueue.put(["Fail: {}".format(str(exp))])
コード例 #3
0
def test_save_cool():
    """Save a loaded cool matrix and check that reloading gives equal data."""
    cool_outfile = outfile + '.cool'

    # Load the reference matrix.
    source_handler = MatrixFileHandler(pFileType='cool',
                                       pMatrixFile=ROOT + 'Li_et_al_2015.cool')
    assert source_handler is not None
    (matrix, cut_intervals, nan_bins, distance_counts,
     correction_factors) = source_handler.load()

    # Hand the loaded data back to the handler and write it out.
    source_handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                        correction_factors, distance_counts)
    source_handler.save(pName=cool_outfile,
                        pSymmetric=True,
                        pApplyCorrection=True)

    # Read the written file back in.
    reloaded_handler = MatrixFileHandler(pFileType='cool',
                                         pMatrixFile=cool_outfile)
    assert reloaded_handler is not None
    (matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test,
     correction_factors_test) = reloaded_handler.load()

    nt.assert_equal(matrix.data, matrix_test.data)
    expected_and_actual = [
        (cut_intervals, cut_intervals_test),
        (nan_bins, nan_bins_test),
        (distance_counts, distance_counts_test),
        (correction_factors, correction_factors_test),
    ]
    for expected, actual in expected_and_actual:
        nt.assert_equal(expected, actual)

    os.unlink(cool_outfile)
コード例 #4
0
def load_cool_files(pMatrixName, pMatricesList, pCutIntervals, pQueue):
    """Load matrices from ``pMatrixName`` and queue them as cool handlers.

    Each name in ``pMatricesList`` is loaded from
    ``pMatrixName + "::" + name`` without its cut intervals. The loaded
    data is stored in a new 'cool' ``MatrixFileHandler`` that is named after
    the last '/'-separated component of the matrix name and that uses
    ``pCutIntervals`` as its cut intervals.

    On success the list of handlers is put on ``pQueue``. If an exception
    occurs, a string starting with ``'Fail: '`` followed by the message and
    the formatted traceback is queued instead.
    """
    handlers = []
    try:
        for matrix in pMatricesList:
            source_handler = MatrixFileHandler(
                pFileType='cool',
                pMatrixFile=pMatrixName + "::" + matrix,
                pNoCutIntervals=True)

            # The cut intervals from the file are ignored on purpose;
            # pCutIntervals is used for every output handler.
            (_matrix, _, nan_bins, distance_counts,
             correction_factors) = source_handler.load()

            output_handler = MatrixFileHandler(
                pFileType='cool', pMatrixFile=matrix.rsplit('/', 1)[-1])
            output_handler.set_matrix_variables(_matrix, pCutIntervals,
                                                nan_bins, correction_factors,
                                                distance_counts)

            handlers.append(output_handler)
    except Exception as exp:
        pQueue.put('Fail: ' + str(exp) + traceback.format_exc())
    else:
        pQueue.put(handlers)
コード例 #5
0
def load_cool_files(pMatricesList, pCutIntervals, pQueue):
    """Load every cool file of ``pMatricesList`` and queue the handlers.

    Each file is loaded without its cut intervals and wrapped in a new
    'cool' ``MatrixFileHandler`` that uses ``pCutIntervals`` instead. Files
    that raise an exception while being processed are skipped with a
    warning. The list of successfully created handlers is put on ``pQueue``.
    """
    handlers = []
    for matrix in pMatricesList:
        try:
            source_handler = MatrixFileHandler(pFileType='cool',
                                               pMatrixFile=matrix,
                                               pNoCutIntervals=True)

            # Cut intervals of the file itself are discarded.
            (_matrix, _, nan_bins, distance_counts,
             correction_factors) = source_handler.load()

            output_handler = MatrixFileHandler(pFileType='cool',
                                               pMatrixFile=matrix)
            output_handler.set_matrix_variables(_matrix, pCutIntervals,
                                                nan_bins, correction_factors,
                                                distance_counts)

            handlers.append(output_handler)
        except Exception as exp:
            message = 'File could not be opend and is excluded: {}. Error message: {} '
            log.warning(message.format(matrix, str(exp)))

    pQueue.put(handlers)
コード例 #6
0
def test_save_cool_enforce_integer():
    """Save an h5 matrix as cool with integer enforcement and reload it.

    The reloaded (uncorrected) data is compared against the rounded
    original data.
    """
    cool_outfile = outfile + '.cool'

    # Load the h5 reference matrix.
    h5_handler = MatrixFileHandler(pFileType='h5',
                                   pMatrixFile=ROOT + 'Li_et_al_2015.h5')
    assert h5_handler is not None
    (matrix, cut_intervals, nan_bins, distance_counts,
     correction_factors) = h5_handler.load()

    # Write it through a cool handler that enforces integer counts.
    cool_handler = MatrixFileHandler(pFileType='cool', pEnforceInteger=True)
    cool_handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                      correction_factors, distance_counts)
    cool_handler.matrixFile.fileWasH5 = True
    cool_handler.save(pName=cool_outfile,
                      pSymmetric=False,
                      pApplyCorrection=True)

    # Reload the written file without applying the correction factors.
    reloaded_handler = MatrixFileHandler(pFileType='cool',
                                         pMatrixFile=cool_outfile,
                                         pApplyCorrectionCoolerLoad=False)
    assert reloaded_handler is not None
    (matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test,
     correction_factors_test) = reloaded_handler.load()

    # Round the reference data the same way the integer save is expected to.
    matrix.data = np.rint(matrix.data)
    matrix.eliminate_zeros()

    nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=0)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)

    os.unlink(cool_outfile)
コード例 #7
0
def test_load_distance_cool():
    """Check distance-restricted loading of a cool file and its round trip."""
    cool_outfile = outfile + '.cool'
    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'

    def count_beyond_first_diagonal(sparse_matrix):
        # Number of non-zero entries more than one bin away from the main
        # diagonal (2.5 Mb resolution --> more than 2.5 Mb apart).
        rows, cols = sparse_matrix.nonzero()
        return np.sum(np.absolute(rows - cols) > 1)

    # Load chromosome 1 restricted to a distance of 2.5 Mb.
    restricted_handler = MatrixFileHandler(pFileType='cool',
                                           pMatrixFile=pMatrixFile,
                                           pChrnameList=['1'],
                                           pDistance=2500000)
    assert restricted_handler is not None
    (matrix, cut_intervals, nan_bins, distance_counts,
     correction_factors) = restricted_handler.load()

    # Store the loaded data again and reload the written file.
    restricted_handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                            correction_factors,
                                            distance_counts)
    restricted_handler.save(pName=cool_outfile,
                            pSymmetric=True,
                            pApplyCorrection=True)

    reloaded_handler = MatrixFileHandler(pFileType='cool',
                                         pMatrixFile=cool_outfile)
    assert reloaded_handler is not None
    (matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test,
     correction_factors_test) = reloaded_handler.load()

    # The restricted load must not contain contacts beyond the distance.
    assert count_beyond_first_diagonal(matrix) == 0

    # Without the distance restriction such contacts are present.
    full_handler = MatrixFileHandler(pFileType='cool',
                                     pChrnameList=['1'],
                                     pMatrixFile=pMatrixFile)
    assert full_handler is not None
    matrix2, _, _, _, _ = full_handler.load()
    assert count_beyond_first_diagonal(matrix2) > 0

    # Loaded and saved matrices have to be equal.
    nt.assert_equal(matrix.data, matrix_test.data)
    nt.assert_equal(cut_intervals, cut_intervals_test)
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)
    nt.assert_equal(correction_factors, correction_factors_test)

    os.unlink(cool_outfile)
コード例 #8
0
def compute_consensus_matrix(pMatrixName, pClusterMatricesList, pClusterName,
                             pQueue):
    """Sum all matrices of one cluster and queue the consensus handler.

    The first matrix of ``pClusterMatricesList`` is loaded completely from
    ``pMatrixName + '::' + name``; its bin information is reused for the
    consensus matrix. Every further matrix is loaded with
    ``pLoadMatrixOnly=True``, converted to a ``csr_matrix`` and added.

    The result is stored in a 'cool' ``MatrixFileHandler`` named
    ``'consensus_matrix_cluster_<pClusterName>:<number of matrices>'`` and
    put on ``pQueue``.

    If anything fails (including an empty ``pClusterMatricesList``), the
    exception is logged and ``None`` is put on ``pQueue``. Previously the
    output handler was unbound in that case, so the final ``put`` raised
    ``UnboundLocalError`` and nothing was ever queued.
    """
    # Defined up front so that the final put() always has a value to send.
    matrixFileHandlerOutput = None
    try:
        matrixFileHandlerInput = MatrixFileHandler(
            pFileType='cool',
            pMatrixFile=pMatrixName + '::' + pClusterMatricesList[0])
        consensus_matrix, cut_intervals, nan_bins, \
            distance_counts, correction_factors = matrixFileHandlerInput.load()

        for matrix in pClusterMatricesList[1:]:
            matrixFileHandlerInput = MatrixFileHandler(
                pFileType='cool',
                pMatrixFile=pMatrixName + '::' + matrix,
                pLoadMatrixOnly=True)
            _matrix, _, _, _, _ = matrixFileHandlerInput.load()

            # With pLoadMatrixOnly the matrix is indexed as
            # [0]=rows, [1]=columns, [2]=values, [3]=dimension.
            # `float` replaces the alias `np.float`, removed in NumPy 1.24.
            _matrix = csr_matrix((_matrix[2], (_matrix[0], _matrix[1])),
                                 (_matrix[3], _matrix[3]),
                                 dtype=float)

            consensus_matrix += _matrix

        # The hic2cool version is taken from the most recently loaded input.
        hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
        matrixFileHandlerOutput = MatrixFileHandler(
            pFileType='cool',
            pMatrixFile='consensus_matrix_cluster_' + str(pClusterName) + ':' +
            str(len(pClusterMatricesList)),
            pEnforceInteger=False,
            pFileWasH5=False,
            pHic2CoolVersion=hic2CoolVersion)

        matrixFileHandlerOutput.set_matrix_variables(consensus_matrix,
                                                     cut_intervals, nan_bins,
                                                     correction_factors,
                                                     distance_counts)
    except Exception as exp:
        log.debug('exception! {}'.format(str(exp)))
    log.debug('computaiton of {} done'.format(str(pClusterName)))
    pQueue.put(matrixFileHandlerOutput)
コード例 #9
0
def test_save_homer():
    """Load a homer matrix, store it via the same handler and clean up."""
    homer_outfile = outfile + '.homer'

    homer_handler = MatrixFileHandler(pFileType='homer',
                                      pMatrixFile=ROOT + 'test_matrix.homer')
    assert homer_handler is not None

    # Load the data and hand it straight back to the handler.
    (matrix, cut_intervals, nan_bins, distance_counts,
     correction_factors) = homer_handler.load()
    homer_handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                       correction_factors, distance_counts)

    # The original test marks this save call as "not implemented".
    homer_handler.save(pName=homer_outfile,
                       pSymmetric=False,
                       pApplyCorrection=False)
    os.unlink(homer_outfile)
コード例 #10
0
def test_load_h5_save_cool():
    """Convert an h5 matrix to cool and compare the reloaded result.

    The reloaded data is expected to equal the original data divided by the
    product of the row and column correction factors, and the reloaded
    correction factors are expected to be the inverted original factors
    (with NaN and infinite values replaced by 0).
    """
    cool_outfile = outfile + '.cool'

    # Load the h5 reference matrix.
    h5_handler = MatrixFileHandler(pFileType='h5',
                                   pMatrixFile=ROOT + 'Li_et_al_2015.h5')
    assert h5_handler is not None
    (matrix, cut_intervals, nan_bins, distance_counts,
     correction_factors) = h5_handler.load()

    # Save it through a cool handler that is flagged as originating from h5.
    cool_handler = MatrixFileHandler(pFileType='cool')
    cool_handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                      correction_factors, distance_counts)
    cool_handler.matrixFile.fileWasH5 = True
    cool_handler.save(pName=cool_outfile,
                      pSymmetric=False,
                      pApplyCorrection=True)

    reloaded_handler = MatrixFileHandler(pFileType='cool',
                                         pMatrixFile=cool_outfile)
    assert reloaded_handler is not None
    (matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test,
     correction_factors_test) = reloaded_handler.load()

    # Per-entry factor: row factor times column factor.
    rows, cols = matrix.nonzero()
    pair_factors = correction_factors[rows]
    pair_factors *= correction_factors[cols]
    expected_data = matrix.data / pair_factors

    nt.assert_almost_equal(expected_data, matrix_test.data, decimal=1)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)

    # Invert the factors and zero out every NaN / infinite entry.
    inverted_factors = 1 / correction_factors
    inverted_factors[np.isnan(inverted_factors)] = 0
    inverted_factors[np.isinf(inverted_factors)] = 0
    nt.assert_equal(inverted_factors, correction_factors_test)

    os.unlink(cool_outfile)
コード例 #11
0
def test_save_h5():
    """Load an h5 matrix, save it again via the same handler and clean up."""
    h5_outfile = outfile + '.h5'

    h5_handler = MatrixFileHandler(pFileType='h5',
                                   pMatrixFile=ROOT + 'Li_et_al_2015.h5')
    assert h5_handler is not None

    # Load the data and hand it straight back to the handler.
    (matrix, cut_intervals, nan_bins, distance_counts,
     correction_factors) = h5_handler.load()
    h5_handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                    correction_factors, distance_counts)

    # Arguments are passed positionally, exactly as the handler expects them.
    h5_handler.save(h5_outfile, True, None)

    os.unlink(h5_outfile)
コード例 #12
0
def txt_to_matrixFileHandler(pMatricesList, pMatrixDimensions, pCutIntervals,
                             pQueue):
    """Convert tab-separated contact files to cool handlers and queue them.

    Every file in ``pMatricesList`` is read line by line; the first three
    tab-separated columns of each non-empty line are taken as row index,
    column index and count. From these a square ``csr_matrix`` of size
    ``pMatrixDimensions`` is built and stored, together with
    ``pCutIntervals``, in a 'cool' ``MatrixFileHandler`` named after the
    file.

    The third '_'-separated part of the file name is used as cell type and
    written to the handler's ``hic_metadata`` when no metadata exists yet.

    The list of handlers is put on ``pQueue``; nothing is returned.

    Raises:
        IndexError: if a file name has fewer than three '_'-separated parts.
        ValueError: if a line has fewer than three columns or a column
            cannot be converted to a number.
    """
    matrixFileHandlerList = []

    for matrix in pMatricesList:

        # Collect the COO triplets of this file.
        instances = []
        features = []
        data = []
        with open(matrix, 'r') as file:
            # Iterate lazily instead of materialising the file via readlines().
            for line in file:
                line = line.strip()
                if not line:
                    continue
                x, y, count = line.split('\t')[:3]
                instances.append(int(x))
                features.append(int(y))
                data.append(float(count))

        cell_type = matrix.split('_')[2]

        log.debug('matrix name {}'.format(matrix))

        # default=None keeps a file without contacts from crashing inside a
        # mere debug message.
        log.debug(
            'max(instances) {} max(features) {} pMatrixDimensions {}'.format(
                max(instances, default=None), max(features, default=None),
                pMatrixDimensions))
        # `float` replaces the alias `np.float`, removed in NumPy 1.24.
        hic_matrix = csr_matrix((data, (instances, features)),
                                (pMatrixDimensions, pMatrixDimensions),
                                dtype=float)

        matrixFileHandlerOutput = MatrixFileHandler(pFileType='cool',
                                                    pMatrixFile=matrix)

        matrixFileHandlerOutput.set_matrix_variables(hic_matrix, pCutIntervals,
                                                     None, None, None)

        # NOTE(review): the cell type is only recorded when no metadata
        # exists yet; existing metadata is left untouched - confirm intended.
        if matrixFileHandlerOutput.matrixFile.hic_metadata is None:
            matrixFileHandlerOutput.matrixFile.hic_metadata = {}
            matrixFileHandlerOutput.matrixFile.hic_metadata[
                'cell_type'] = cell_type

        matrixFileHandlerList.append(matrixFileHandlerOutput)

    pQueue.put(matrixFileHandlerList)
コード例 #13
0
ファイル: scHicNormalize.py プロジェクト: xjyx/scHiCExplorer
def compute_normalize(pMatrixName, pMatricesList, pArgminSum, pSumOfAll,
                      pAppend, pQueue):
    """Scale every matrix by its read-count ratio and queue the handlers.

    Each name in ``pMatricesList`` is loaded from
    ``pMatrixName + '::' + name``. Its data is cast to ``float32``, divided
    by ``pSumOfAll[i] / pArgminSum`` and stored in a new 'cool'
    ``MatrixFileHandler``. NaN and infinite values are set to 0 before and
    after the division, and explicit zeros are removed afterwards.

    The first handler is created with ``pAppend=False`` only if ``pAppend``
    is truthy; all other handlers use ``pAppend=True``.

    The list of handlers is put on ``pQueue``; nothing is returned.
    """
    handlers = []
    for index, matrix in enumerate(pMatricesList):
        append = not (index == 0 and pAppend)

        source_handler = MatrixFileHandler(pFileType='cool',
                                           pMatrixFile=pMatrixName + '::' +
                                           matrix)
        (_matrix, cut_intervals, nan_bins, distance_counts,
         correction_factors) = source_handler.load()

        _matrix.data = _matrix.data.astype(np.float32)
        # Zero out NaN and +/-inf entries before scaling ...
        _matrix.data[~np.isfinite(_matrix.data)] = 0

        _matrix.data /= pSumOfAll[index] / pArgminSum

        # ... and again afterwards, in case the division produced new ones.
        _matrix.data[~np.isfinite(_matrix.data)] = 0
        _matrix.eliminate_zeros()

        output_handler = MatrixFileHandler(pFileType='cool',
                                           pAppend=append,
                                           pEnforceInteger=False,
                                           pFileWasH5=False,
                                           pHic2CoolVersion=None)
        output_handler.set_matrix_variables(_matrix, cut_intervals, nan_bins,
                                            correction_factors,
                                            distance_counts)

        handlers.append(output_handler)

    pQueue.put(handlers)
コード例 #14
0
def compute_correction(pMatrixName, pMatrixList, pCutIntervals, pQueue):
    """Compute KR correction factors per matrix and queue the handlers.

    Each name in ``pMatrixList`` is loaded via ``load_matrix`` from
    ``pMatrixName + '::' + name``. Matrices whose pixel table lacks one of
    the columns ``bin1_id``, ``bin2_id`` or ``count`` are skipped. For all
    others a ``csr_matrix`` is built, balanced with ``kr_balancing`` and
    stored together with ``pCutIntervals`` and the resulting normalisation
    vector in a 'cool' ``MatrixFileHandler`` named after the matrix.

    On success the list of handlers is put on ``pQueue``. If an exception
    occurs, its message (a plain string) is queued instead.
    """
    out_queue_list = []

    print('len(pMatrixList): ' + str(len(pMatrixList)))
    try:
        for i, matrix in enumerate(pMatrixList):

            pixels, shape, _ = load_matrix(pMatrixName + '::' + matrix, None, False, None)

            required_columns = ('bin1_id', 'bin2_id', 'count')
            if not all(column in pixels.columns for column in required_columns):
                continue

            instances = pixels['bin1_id'].values
            features = pixels['bin2_id'].values
            data = pixels['count'].values

            # Kept in its own variable: rebinding `matrix` here used to hand
            # the sparse matrix, not the matrix name, to the output handler
            # as pMatrixFile.
            # `float` replaces the alias `np.float`, removed in NumPy 1.24.
            cell_matrix = csr_matrix((data, (instances, features)), (shape[0], shape[1]), dtype=float)

            kr = kr_balancing(shape[0], shape[1],
                              cell_matrix.count_nonzero(), cell_matrix.indptr.astype(np.int64, copy=False),
                              cell_matrix.indices.astype(np.int64, copy=False), cell_matrix.data.astype(np.float64, copy=False))
            kr.computeKR()
            correction_factors = kr.get_normalisation_vector(False).todense()

            matrixFileHandlerOutput = MatrixFileHandler(pFileType='cool', pMatrixFile=matrix)

            matrixFileHandlerOutput.set_matrix_variables(cell_matrix,
                                                         pCutIntervals,
                                                         None,
                                                         correction_factors,
                                                         None)

            out_queue_list.append(matrixFileHandlerOutput)
            print('DOne i: ' + str(i))
    except Exception as exp:
        print('Exception: ' + str(exp))
        log.debug('Exception! {}'.format(str(exp)))
        pQueue.put(str(exp))
        return

    pQueue.put(out_queue_list)
コード例 #15
0
def test_save_scool_matrixHandlersCool():
    """Save three cool handlers into one scool file and list its cells."""
    outfile = NamedTemporaryFile(suffix='.scool',
                                 prefix='hicmatrix_scool_test')

    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'

    # All three cells share the data of the same input matrix.
    input_handler = MatrixFileHandler(pFileType='cool',
                                      pMatrixFile=pMatrixFile)
    (matrix, cut_intervals, nan_bins, distance_counts,
     correction_factors) = input_handler.load()

    cell_handlers = []
    for cell_name in ('cell1', 'cell2', 'cell3'):
        cell_handler = MatrixFileHandler(pFileType='cool',
                                         pMatrixFile=cell_name,
                                         pEnforceInteger=False,
                                         pFileWasH5=False,
                                         pHic2CoolVersion=None)
        cell_handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                          correction_factors, distance_counts)
        cell_handlers.append(cell_handler)

    scool_handler = MatrixFileHandler(pFileType='scool')
    scool_handler.matrixFile.coolObjectsList = cell_handlers
    scool_handler.save(outfile.name,
                       pSymmetric=True,
                       pApplyCorrection=False)

    # Every cell has to show up in the written scool file.
    cells_in_file = cooler.fileops.list_scool_cells(outfile.name)
    for expected_cell in ['/cells/cell1', '/cells/cell2', '/cells/cell3']:
        assert expected_cell in cells_in_file
コード例 #16
0
def main(args=None):
    """Aggregate Hi-C contacts around BED regions and plot the result.

    The command line in ``args`` (``None`` is passed on to ``parse_args``
    unchanged) is parsed, the matrix is loaded and optionally restricted to
    chromosomes and transformed (z-score or obs/exp). Submatrices around the
    BED regions are aggregated, optionally clustered (k-means or
    hierarchical) and plotted. Depending on the arguments the obs/exp
    matrix, the contact pairs per cluster and diagnostic heatmaps are
    written as well.

    The program exits via ``exit`` on invalid argument combinations or when
    no submatrix could be aggregated.
    """
    args = parse_arguments().parse_args(args)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)
    ma.matrix.data[np.isnan(ma.matrix.data)] = 0
    ma.maskBins(ma.nan_bins)
    new_intervals = hicexplorer.utilities.enlarge_bins(ma.cut_intervals)
    ma.setCutIntervals(new_intervals)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    default_range = '1000000:20000000'
    if args.range is None:
        # The default is applied in every mode; only intra-chr gets a warning.
        if args.mode == "intra-chr":
            log.warning("You have not set any range. This is by default set to {} for intra-chr.".format(default_range))
        args.range = default_range
    min_dist, max_dist = args.range.split(":")
    log.info("checking range {}-{}".format(min_dist, max_dist))
    assert int(min_dist) < int(max_dist), "Error lower range is larger than upper range!"
    if args.transform == "z-score":  # use zscore matrix
        log.info("Computing z-score matrix. This may take a while.\n")
        if args.mode == 'intra-chr':
            ma.convert_to_zscore_matrix(maxdepth=int(max_dist) * 2.5, perchr=True)
        else:
            ma.convert_to_zscore_matrix(maxdepth=None, perchr=True)
    elif args.transform == "obs/exp":  # use obs/exp matrix
        log.info("Computing observed vs. expected matrix. This may take a while.\n")
        if args.mode == 'intra-chr':
            ma.convert_to_obs_exp_matrix(maxdepth=int(max_dist) * 2.5, perchr=True)
        else:
            ma.convert_to_obs_exp_matrix(maxdepth=None, perchr=True)
        if args.outFileObsExp:
            # The output format follows the file extension; cool is the default.
            file_type = 'cool'
            if args.outFileObsExp.endswith('.h5'):
                file_type = 'h5'
            matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
            matrixFileHandlerOutput.set_matrix_variables(ma.matrix,
                                                         ma.cut_intervals,
                                                         ma.nan_bins,
                                                         ma.correction_factors,
                                                         ma.distance_counts)
            matrixFileHandlerOutput.save(args.outFileObsExp, pSymmetric=True, pApplyCorrection=False)

    # The submatrix size is forced to be odd so that it has a center bin.
    M = args.numberOfBins if args.numberOfBins % 2 == 1 else args.numberOfBins + 1
    M_half = int((M - 1) // 2)

    # Map every chromosome to the start of its first and the end of its last bin.
    chrom_coord = dict()
    chrom_list = ma.getChrNames()
    for chrom in chrom_list:
        first, last = ma.getChrBinRange(chrom)
        first = ma.getBinPos(first)
        last = ma.getBinPos(last - 1)
        chrom_coord[chrom] = (first[1], last[2])

    agg_info = dict()
    agg_info["chrom_coord"] = chrom_coord
    agg_info["seen"] = []
    agg_info["agg_matrix"] = OrderedDict()
    agg_info["agg_total"] = {}
    agg_info["agg_diagonals"] = OrderedDict()
    agg_info["agg_contact_position"] = {}
    agg_info["agg_center_values"] = {}
    agg_info["counter"] = 0
    agg_info["used_counter"] = 0
    agg_info["empty_mat"] = 0
    if (args.mode == 'inter-chr') and (len(agg_info["chrom_coord"]) == 1):
        exit("Error: 'inter-chr' mode can not be applied on matrices of only one chromosme.")
    if args.row_wise:
        # read bed files
        bed_intervals = args.BED.readlines()
        if args.BED2:
            bed_intervals2 = args.BED2.readlines()
        else:
            log.error("Error computing row-wise contacts requires two bed files!")
            exit("Error computing row-wise contacts requires two bed files!")
        # agg_matrix could be either per chromosome or genome wide
        aggregate_contacts_per_row(bed_intervals, bed_intervals2, agg_info, ma, chrom_list, M_half, args.largeRegionsOperation, args.range, args.transform, mode=args.mode, perChr=args.perChr)
    else:  # not row-wise
        # read and sort bed files.
        bed_intervals = read_bed_per_chrom(args.BED, chrom_list)
        if args.BED2:
            bed_intervals2 = read_bed_per_chrom(args.BED2, chrom_list)
        else:
            bed_intervals2 = bed_intervals
        # agg_matrix could be either per chromosome or genome wide
        aggregate_contacts(bed_intervals, bed_intervals2, agg_info, ma, M_half, args.largeRegionsOperation, args.range, args.transform, mode=args.mode, perChr=args.perChr)

    if args.kmeans is not None:
        cluster_ids = cluster_matrices(agg_info["agg_matrix"], args.kmeans, method='kmeans', how=args.howToCluster)
        num_clusters = args.kmeans
    elif args.hclust is not None:
        log.info("Performing hierarchical clustering."
                 "Please note that it might be very slow for large datasets.\n")
        cluster_ids = cluster_matrices(agg_info["agg_matrix"], args.hclust, method='hierarchical',
                                       how=args.howToCluster)
        num_clusters = args.hclust
    else:
        # make a 'fake' clustering to generalize the plotting of the submatrices
        cluster_ids = {}
        num_clusters = 1
        for k in agg_info["agg_matrix"].keys():
            cluster_ids[k] = [range(len(agg_info["agg_matrix"][k]))]
    if len(agg_info["agg_matrix"]) == 0:
        exit("No susbmatrix found to be aggregated.")
    plot_aggregated_contacts(agg_info["agg_matrix"], agg_info["agg_contact_position"], cluster_ids, num_clusters, M_half, args)

    if args.outFileContactPairs:
        for chrom in agg_info["agg_matrix"]:
            if chrom not in bed_intervals or chrom not in bed_intervals2:
                continue
            for cluster_number, cluster_indices in enumerate(cluster_ids[chrom]):
                center_values_to_order = np.array(agg_info["agg_center_values"][chrom])[cluster_indices]
                # Positions within the cluster, ordered by descending center value.
                center_values_order = np.argsort(center_values_to_order)[::-1]

                output_name = "{file}_{chrom}_cluster_{id}.tab".format(file=args.outFileContactPairs,
                                                                       chrom=chrom, id=cluster_number + 1)
                with open(output_name, 'w') as fh:
                    for cl_idx in center_values_order:
                        value = center_values_to_order[cl_idx]
                        # cl_idx is relative to this cluster; translate it
                        # back to the index in the per-chromosome list before
                        # looking up the contact position. Without this, the
                        # wrong coordinates were written whenever a cluster
                        # did not cover all contacts in their original order.
                        # Assumes agg_center_values and agg_contact_position
                        # are parallel lists per chromosome - TODO confirm.
                        contact_idx = cluster_indices[cl_idx]
                        start, end, start2, end2 = agg_info["agg_contact_position"][chrom][contact_idx]
                        fh.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(chrom, start, end, chrom, start2, end2, value))

    # plot the diagonals
    # the diagonals plot is useful to see individual cases and if they had a contact in the center
    if args.diagnosticHeatmapFile:
        plot_diagnostic_heatmaps(agg_info["agg_diagonals"], cluster_ids, M_half, args)
コード例 #17
0
def main(args=None):
    """Compute eigenvectors per chromosome and write them as bedgraph or bigwig.

    For every chromosome of the input matrix an observed/expected matrix is
    computed (``exp_obs_matrix_norm`` if ``args.norm`` is set, otherwise
    ``exp_obs_matrix_lieberman``), followed by its Pearson correlation matrix
    and the covariance of that correlation matrix.  The first
    ``args.numberOfEigenvectors`` columns returned by ``linalg.eig`` are
    collected for every bin.  One output file is written per eigenvector.
    If requested, the Pearson and the obs/exp matrices are saved as well.

    :param args: argument list forwarded to the parser created by
                 ``parse_arguments()``.
    """
    args = parse_arguments().parse_args(args)
    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        log.error(
            "Number of output file names and number of eigenvectors does not match. Please "
            "provide the name of each file.\nFiles: {}\nNumber of eigenvectors: {}"
            .format(args.outputFileName, args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []
    # PCA is computed per chromosome
    length_chromosome = 0
    chromosome_count = len(ma.getChrNames())
    if args.pearsonMatrix:
        trasf_matrix_pearson = lil_matrix(ma.matrix.shape)

    if args.obsexpMatrix:
        trasf_matrix_obsexp = lil_matrix(ma.matrix.shape)

    # total number of bins over all chromosomes, needed by the obs/exp helpers
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]

    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)

        submatrix = ma.matrix[chr_range[0]:chr_range[1],
                              chr_range[0]:chr_range[1]]
        if args.norm:
            exp_obs_matrix_ = exp_obs_matrix_norm(submatrix, length_chromosome,
                                                  chromosome_count)
        else:
            exp_obs_matrix_ = exp_obs_matrix_lieberman(submatrix,
                                                       length_chromosome,
                                                       chromosome_count)
        # both normalisations get the same NaN / Inf clean-up
        exp_obs_matrix_ = convertNansToZeros(
            csr_matrix(exp_obs_matrix_)).todense()
        exp_obs_matrix_ = convertInfsToZeros(
            csr_matrix(exp_obs_matrix_)).todense()

        if args.obsexpMatrix:
            trasf_matrix_obsexp[chr_range[0]:chr_range[1],
                                chr_range[0]:chr_range[1]] = lil_matrix(
                                    exp_obs_matrix_)

        pearson_correlation_matrix = np.corrcoef(exp_obs_matrix_)
        pearson_correlation_matrix = convertNansToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()
        pearson_correlation_matrix = convertInfsToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()

        if args.pearsonMatrix:
            trasf_matrix_pearson[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = lil_matrix(
                                     pearson_correlation_matrix)

        corrmatrix = np.cov(pearson_correlation_matrix)
        corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
        corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
        evals, eigs = linalg.eig(corrmatrix)
        k = args.numberOfEigenvectors

        chrom, start, end, _ = zip(
            *ma.cut_intervals[chr_range[0]:chr_range[1]])
        # one row per bin, holding the first k eigenvector components
        vecs_list += eigs[:, :k].tolist()

        chrom_list += chrom
        start_list += start
        end_list += end

    if args.pearsonMatrix:
        file_type = 'cool'
        if args.pearsonMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_pearson.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.pearsonMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.obsexpMatrix:
        file_type = 'cool'
        if args.obsexpMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_obsexp.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.obsexpMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.geneTrack:
        vecs_list = correlateEigenvectorWithGeneTrack(ma, vecs_list,
                                                      args.geneTrack)

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert (len(vecs_list) == len(chrom_list))

            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    if len(value) == args.numberOfEigenvectors:
                        # ``np.complex`` was only an alias of the builtin and
                        # no longer exists in NumPy >= 1.24
                        if isinstance(value[idx], complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(
                            toString(chrom_list[i]), start_list[i],
                            end_list[i], value[idx]))

    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error(
                "ERROR: Your version of pyBigWig is not supporting numpy: {}".
                format(pyBigWig.__file__))
            exit(1)
        # the bigwig header lists every chromosome with the end of its last bin
        old_chrom = chrom_list[0]
        header = []
        for i, _chrom in enumerate(chrom_list):
            if old_chrom != _chrom:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = _chrom

        header.append((toString(chrom_list[-1]), end_list[-1]))
        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))

            assert (len(vecs_list) == len(chrom_list))
            _chrom_list = []
            _start_list = []
            _end_list = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            # set big wig header
            bw.addHeader(header)
            # create entry lists
            for i, value in enumerate(vecs_list):
                # it can happen that some 'value' is having less dimensions than it should
                if len(value) == args.numberOfEigenvectors:
                    if isinstance(value[idx], complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    _chrom_list.append(toString(chrom_list[i]))
                    _start_list.append(start_list[i])
                    _end_list.append(end_list[i])

            # write entries
            bw.addEntries(_chrom_list,
                          _start_list,
                          ends=_end_list,
                          values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)
コード例 #18
0
def main(args=None):
    """Convert Hi-C matrices between the supported file formats.

    * ``hic`` input is handed to ``hic2cool_convert`` and can only be written
      as ``cool``; with ``--resolutions`` one file per resolution is written,
      with ``_<resolution>`` appended to the second-to-last dot-separated
      part of the output name.
    * ``hicpro``, ``homer``, ``h5`` and ``cool`` input is loaded through
      ``MatrixFileHandler``.
    * ``2D-text`` input is read line by line into a matrix whose bins are
      derived from ``--chromosomeSizes`` and the first ``--resolutions``
      value.

    The loaded matrix is then written as ``cool``, ``h5``, ``homer``,
    ``ginteractions``, ``hicpro`` or ``mcool``.

    :param args: argument list forwarded to the parser created by
                 ``parse_arguments()``.
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!: Input matrices {}; output matrices {}'
                .format(len(args.matrices), len(args.outFileName)))
            sys.exit(1)
    if args.inputFormat == 'hic' and args.outputFormat != 'cool':
        log.error('The export of a hic file is only possible to a cool file.')
        sys.exit(1)
    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:

                for resolution in args.resolutions:
                    out_name = args.outFileName[i].split('.')
                    out_name[-2] = out_name[-2] + '_' + str(resolution)
                    out_name = '.'.join(out_name)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool', '2D-text']:
        format_was_h5 = False
        if args.inputFormat == 'h5':
            format_was_h5 = True
        applyCorrection = True
        if args.store_applied_correction:
            applyCorrection = False
        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        if args.inputFormat == '2D-text':
            if args.resolutions is None:
                log.error('The resolution must be defined via --resolutions')
                sys.exit(1)
            if args.chromosomeSizes is None:
                log.error(
                    'The sizes of the chromosomes must be defined via --chromosomeSizes.'
                )
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = matrixFileHandlerInput.load()
            elif args.inputFormat == '2D-text':
                # read "<chromosome>\t<size>" lines; reading stops at the
                # first line that is empty after stripping
                chrom_sizes = OrderedDict()
                with open(args.chromosomeSizes.name, 'r') as file:
                    file_ = True
                    while file_:
                        file_ = file.readline().strip()
                        if file_ != '':
                            line_split = file_.split('\t')
                            chrom_sizes[line_split[0]] = int(line_split[1])
                chrom_sizes = list(chrom_sizes.items())

                args.resolutions = [int(x) for x in args.resolutions]

                # fixed-size bins per chromosome, the last one is truncated
                # at the chromosome end
                cut_intervals = []
                for chromosome in chrom_sizes:
                    for interval in range(0, chromosome[1],
                                          args.resolutions[0]):
                        cut_intervals.append(
                            tuple([
                                chromosome[0], interval,
                                min(chromosome[1],
                                    interval + args.resolutions[0]), 1.0
                            ]))

                hic_matrix_csr = lil_matrix(
                    (len(cut_intervals), len(cut_intervals)))
                log.debug('cut_intervals {}'.format(cut_intervals[:20]))

                hic_matrix = HiCMatrix.hiCMatrix()
                hic_matrix.setMatrix(hic_matrix_csr, cut_intervals)
                with open(matrix, 'r') as file:
                    for j, line in enumerate(file):
                        line_split = line.split('\t')
                        chromosome_1 = str(line_split[0])
                        start_1 = int(line_split[1])
                        end_1 = int(line_split[2])

                        chromosome_2 = str(line_split[3])
                        start_2 = int(line_split[4])
                        end_2 = int(line_split[5])

                        value = float(line_split[6])
                        bin_id_1 = hic_matrix.getRegionBinRange(
                            chromosome_1, start_1, end_1)
                        bin_id_2 = hic_matrix.getRegionBinRange(
                            chromosome_2, start_2, end_2)
                        # best effort: entries that cannot be assigned are
                        # only reported on debug level
                        try:
                            hic_matrix.matrix[bin_id_1, bin_id_2] = value
                        except Exception as exp:
                            log.debug(str(exp))
                        if j % 1000 == 0:
                            log.debug('{} lines computed'.format(j))
                log.debug('csr with values filled!')
                hic_matrix.matrix = hic_matrix.matrix.tocsr()

                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = hic_matrix.matrix, hic_matrix.cut_intervals, hic_matrix.nan_bins, \
                    hic_matrix.distance_counts, hic_matrix.correction_factors

            else:
                correction_operator = None

                if args.correction_division:
                    correction_operator = '/'

                chromosomes_to_load = None
                if args.chromosome:
                    chromosomes_to_load = [args.chromosome]
                applyCorrectionCoolerLoad = True
                if args.load_raw_values:
                    applyCorrectionCoolerLoad = False
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)

                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('cut_intervals {}'.format(cut_intervals[:20]))

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                log.debug('cool h5 homer ginteractions hicpro branch')

                if args.outputFormat in ['homer', 'ginteractions']:
                    log.debug('homer ginteractions branch')

                    # make it a upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)
                hic2CoolVersion = None
                cool_metadata = None
                if args.inputFormat == 'cool':
                    hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                    cool_metadata = matrixFileHandlerInput.matrixFile.hic_metadata

                log.debug('cool_metadata {}'.format(cool_metadata))
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5,
                    pHic2CoolVersion=hic2CoolVersion,
                    pHiCInfo=cool_metadata)

                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                log.debug('len(args.outFileName) {}, i {}'.format(
                    len(args.outFileName), i))
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)

            if args.outputFormat == 'hicpro':
                log.debug('hicpro branch')
                if len(args.matrices) == len(args.outFileName) and len(
                        args.outFileName) == len(args.bedFileHicpro):
                    log.debug('args.bedFileHicpro[i] {}'.format(
                        args.bedFileHicpro[i]))
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType=args.outputFormat,
                        pBedFileHicPro=args.bedFileHicpro[i])

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[i],
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
                else:
                    # the parsed attribute is ``matrices``; ``args.matrix``
                    # does not exist and would hide this message behind an
                    # AttributeError
                    log.error(
                        'The number of input matrices, output files and bed files does not match: Input: {}; Output: {}; Bed: {}'
                        .format(len(args.matrices), len(args.outFileName),
                                len(args.bedFileHicpro)))
                    sys.exit(1)
            elif args.outputFormat in ['mcool']:

                log.debug('outformat is mcool')
                # NOTE(review): this is only logged, execution continues
                if args.resolutions and len(args.matrices) > 1:
                    log.error(
                        'Please define one matrix and many resolutions which should be created or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)

                    bin_size = hic_matrix.getBinSize()
                    hic2CoolVersion = None
                    cool_metadata = None
                    if args.inputFormat == 'cool':
                        hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                        cool_metadata = matrixFileHandlerInput.matrixFile.hic_metadata
                    for j, resolution in enumerate(args.resolutions):
                        hic_matrix_res = deepcopy(hic_matrix)

                        _mergeFactor = int(resolution) // bin_size

                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            merged_matrix = hic_matrix_res
                        # the first resolution creates the file, later ones
                        # are appended to it
                        append = False
                        if j > 0:
                            append = True
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=append,
                            pFileWasH5=format_was_h5,
                            pHic2CoolVersion=hic2CoolVersion,
                            pHiCInfo=cool_metadata)

                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' +
                            str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)

                else:
                    # one input matrix per resolution: the first matrix
                    # creates the file, later ones are appended
                    append = False
                    if i > 0:
                        append = True
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        pAppend=append,
                        pFileWasH5=format_was_h5)

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
コード例 #19
0
def main(args=None):
    """Merge the bins of every matrix of a multi-cooler file in parallel.

    The matrices listed by ``cooler.fileops.list_coolers`` for ``args.matrix``
    are split into contiguous chunks, one per worker process.  Every worker
    runs ``compute_merge`` and returns its results through its own queue.
    The merged matrices are then written to ``args.outFileName`` under the
    name of the matrix they were computed from.

    :param args: argument list forwarded to the parser created by
                 ``parse_arguments()``.
    """
    args = parse_arguments().parse_args(args)

    matrices_list = cooler.fileops.list_coolers(args.matrix)
    if not matrices_list:
        # nothing to distribute; without this guard the chunk size
        # computation below would divide by zero
        log.error('No matrices found in {}'.format(args.matrix))
        raise SystemExit(1)

    # Never start more workers than there are matrices.  The clamp has to
    # happen before the per-worker lists are sized: a result list that is
    # longer than the number of workers keeps ``None`` entries which break
    # the flattening step below.
    threads = min(args.threads, len(matrices_list))

    merged_matrices = [None] * threads
    thread_done = [False] * threads
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):

        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) *
                                               matricesPerThread]
        else:
            # the last worker also takes the remainder
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_merge,
                             kwargs=dict(pMatrixName=args.matrix,
                                         pMatrixList=matrices_name_list,
                                         pRunningWindow=args.runningWindow,
                                         pNumBins=args.numBins,
                                         pQueue=queue[i]))

        process[i].start()

    # poll the queues until every worker has delivered its result
    while not all(thread_done):
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                log.debug('i {}'.format(i))
                log.debug('len(queue) {}'.format(len(queue)))
                log.debug('len(merged_matrices) {}'.format(
                    len(merged_matrices)))

                merged_matrices[i] = queue[i].get()

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        # wait once per polling round, and only if something is still pending
        if not all(thread_done):
            time.sleep(1)

    # chunks are contiguous and ordered, so the flattened list lines up with
    # matrices_list
    merged_matrices = [item for sublist in merged_matrices for item in sublist]

    for i, hic_matrix in enumerate(merged_matrices):
        # the first matrix creates the output file, all others are appended
        append = False
        if i > 0:
            append = True
        matrixFileHandlerOutput = MatrixFileHandler(pFileType='cool',
                                                    pAppend=append,
                                                    pFileWasH5=False)

        matrixFileHandlerOutput.set_matrix_variables(
            hic_matrix.matrix, hic_matrix.cut_intervals, hic_matrix.nan_bins,
            hic_matrix.correction_factors, hic_matrix.distance_counts)
        matrixFileHandlerOutput.save(args.outFileName + '::' +
                                     matrices_list[i],
                                     pSymmetric=True,
                                     pApplyCorrection=False)
コード例 #20
0
def main(args=None):
    """Convert Hi-C matrices between the supported file formats.

    ``hic`` input is handed to ``hic2cool_convert`` and can only be written
    as ``cool``.  ``hicpro``, ``homer``, ``h5`` and ``cool`` input is loaded
    through ``MatrixFileHandler`` and written as ``cool``, ``h5``, ``homer``,
    ``ginteractions`` or ``mcool``.

    :param args: argument list forwarded to the parser created by
                 ``parse_arguments()``.
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!'
            )
            sys.exit(1)
    # Without this guard a hic input with any other output format falls
    # through every branch below and returns without writing anything.
    if args.inputFormat == 'hic' and args.outputFormat != 'cool':
        log.error('The export of a hic file is only possible to a cool file.')
        sys.exit(1)
    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:

                # one output file per resolution, '_<resolution>' is added
                # to the second-to-last dot-separated part of the name
                for resolution in args.resolutions:
                    out_name = args.outFileName[i].split('.')
                    out_name[-2] = out_name[-2] + '_' + str(resolution)
                    out_name = '.'.join(out_name)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']:
        format_was_h5 = False
        if args.inputFormat == 'h5':
            format_was_h5 = True
        applyCorrection = True
        if args.store_applied_correction:
            applyCorrection = False
        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
            else:
                correction_operator = None

                if args.correction_division:
                    correction_operator = '/'

                chromosomes_to_load = None
                if args.chromosome:
                    chromosomes_to_load = [args.chromosome]
                applyCorrectionCoolerLoad = True
                if args.load_raw_values:
                    applyCorrectionCoolerLoad = False
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)

            _matrix, cut_intervals, nan_bins, \
                distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                if args.outputFormat in ['homer', 'ginteractions']:
                    # make it a upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)
                hic2CoolVersion = None
                if args.inputFormat == 'cool':
                    hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5,
                    pHic2CoolVersion=hic2CoolVersion)

                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)
            elif args.outputFormat in ['mcool']:

                log.debug('outformat is mcool')
                # NOTE(review): this is only logged, execution continues
                if args.resolutions and len(args.matrices) > 1:
                    log.error(
                        'Please define one matrix and many resolutions which should be created or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)

                    bin_size = hic_matrix.getBinSize()

                    for j, resolution in enumerate(args.resolutions):
                        hic_matrix_res = deepcopy(hic_matrix)

                        _mergeFactor = int(resolution) // bin_size

                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            merged_matrix = hic_matrix_res
                        # the first resolution creates the file, later ones
                        # are appended to it
                        append = False
                        if j > 0:
                            append = True
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=append,
                            pFileWasH5=format_was_h5)

                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' +
                            str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)

                else:
                    # one input matrix per resolution: the first matrix
                    # creates the file, later ones are appended
                    append = False
                    if i > 0:
                        append = True
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        pAppend=append,
                        pFileWasH5=format_was_h5)

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
コード例 #21
0
def main(args=None):
    """Aggregate Hi-C contacts around BED regions and plot the result.

    Steps, in order:

    1. Parse the command line and load the matrix given by ``args.matrix``.
    2. Mask the NaN bins, set NaN values to zero, enlarge the bins and
       optionally restrict the matrix to ``args.chromosomes``.
    3. Optionally transform the matrix (``z-score`` or ``obs/exp``); the
       obs/exp matrix is saved when ``args.outFileObsExp`` is set.
    4. Aggregate the submatrices (row-wise or all-vs-all), cluster them
       (k-means, hierarchical, or a single pseudo-cluster) and plot them.

    :param args: list of command line tokens handed to the argument parser;
                 ``None`` is passed through to ``parse_args`` unchanged.
    """
    args = parse_arguments().parse_args(args)
    matplotlib.rcParams['pdf.fonttype'] = 42

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)
    ma.matrix.data[np.isnan(ma.matrix.data)] = 0
    ma.maskBins(ma.nan_bins)
    new_intervals = hicexplorer.utilities.enlarge_bins(ma.cut_intervals)
    ma.setCutIntervals(new_intervals)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    default_range = '1000000:20000000'
    if args.range is None:
        if args.mode == "intra-chr":
            log.warning("You have not set any range. This is by default set to {} for intra-chr.".format(default_range))
        args.range = default_range
    elif args.mode in ("inter-chr", "all"):
        log.info("--range is ineffective for inter-chr and all mode.")

    min_dist, max_dist = args.range.split(":")
    intra_chr = args.mode == "intra-chr"
    if intra_chr:
        log.info("checking range {}-{}".format(min_dist, max_dist))
        assert int(min_dist) < int(max_dist), "Error lower range is larger than upper range!"

    # Only the intra-chromosomal mode limits the depth of the transformation.
    maxdepth = int(max_dist) * 2.5 if intra_chr else None
    if args.transform == "z-score":  # use zscore matrix
        log.info("Computing z-score matrix. This may take a while.\n")
        ma.convert_to_zscore_matrix(maxdepth=maxdepth, perchr=True)
    elif args.transform == "obs/exp":  # use obs/exp matrix
        log.info("Computing observed vs. expected matrix. This may take a while.\n")
        ma.convert_to_obs_exp_matrix(maxdepth=maxdepth, perchr=True)
        if args.outFileObsExp:
            file_type = 'h5' if args.outFileObsExp.endswith('.h5') else 'cool'
            matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
            matrixFileHandlerOutput.set_matrix_variables(ma.matrix,
                                                         ma.cut_intervals,
                                                         ma.nan_bins,
                                                         ma.correction_factors,
                                                         ma.distance_counts)
            matrixFileHandlerOutput.save(args.outFileObsExp, pSymmetric=True, pApplyCorrection=False)

    # The aggregated submatrix always has an odd side length so that it has
    # a well defined center bin.
    M = args.numberOfBins if args.numberOfBins % 2 == 1 else args.numberOfBins + 1
    M_half = int((M - 1) // 2)

    chrom_coord = dict()
    chrom_list = ma.getChrNames()
    for chrom in chrom_list:
        first, last = ma.getChrBinRange(chrom)
        first = ma.getBinPos(first)
        last = ma.getBinPos(last - 1)
        chrom_coord[chrom] = (first[1], last[2])

    agg_info = dict()
    agg_info["chrom_coord"] = chrom_coord  # coordinates of each chrom
    agg_info["seen"] = []  # seen bins
    agg_info["agg_matrix"] = {chrom: {} for chrom in chrom_list}  # important
    agg_info["agg_total"] = {chrom: {} for chrom in chrom_list}
    agg_info["agg_diagonals"] = {chrom: {} for chrom in chrom_list}
    agg_info["agg_contact_position"] = {chrom: {} for chrom in chrom_list}  # important
    agg_info["agg_center_values"] = {chrom: {} for chrom in chrom_list}  # important
    agg_info["counter"] = 0
    agg_info["used_counter"] = 0
    agg_info["empty_mat"] = 0

    log.debug('agg_info["agg_matrix"] {}'.format(agg_info["agg_matrix"]))
    if (args.mode == 'inter-chr') and (len(agg_info["chrom_coord"]) == 1):
        exit("Error: 'inter-chr' mode can not be applied on matrices of only one chromosome.")
    if (args.mode == 'inter-chr') and (args.perChr):
        exit("Error: 'inter-chr' mode can not be used along with --perChr.")
    if (args.mode == 'all') and (args.perChr):
        exit("Error: 'all' mode can not be used along with --perChr.")

    if args.row_wise:
        # read bed files
        bed_intervals = args.BED.readlines()
        if not args.BED2:
            log.error("Error computing row-wise contacts requires two bed files!")
            exit("Error computing row-wise contacts requires two bed files!")
        bed_intervals2 = args.BED2.readlines()
        if len(bed_intervals) != len(bed_intervals2):
            log.error("row_wise only works if both bed files have the same length.")
            exit("Error row_wise only works if both bed files have the same length.")
        # agg_matrix could be either per chromosome or genome wide
        aggregate_contacts_per_row(bed_intervals, bed_intervals2, agg_info, ma, chrom_list,
                                   M_half, args.largeRegionsOperation, args.range,
                                   args.transform, mode=args.mode, perChr=args.perChr, pConsiderStrandDirection=args.considerStrandDirection)
    else:  # not row-wise
        # read and sort bed files.
        bed_intervals = read_bed_per_chrom(args.BED, chrom_list, args.considerStrandDirection)
        if args.BED2:
            bed_intervals2 = read_bed_per_chrom(args.BED2, chrom_list, args.considerStrandDirection)
        else:
            # Without a second BED file the regions are compared to themselves.
            bed_intervals2 = bed_intervals
        # agg_matrix could be either per chromosome or genome wide
        aggregate_contacts(bed_intervals, bed_intervals2, agg_info, ma, M_half,
                           args.largeRegionsOperation, args.range, args.transform,
                           mode=args.mode, pConsiderStrandDirection=args.considerStrandDirection)
    if len(agg_info["agg_matrix"]) == 0:
        exit("No submatrix found to be aggregated.")

    # Select the clustering set-up once and issue a single call instead of
    # repeating the call for every method / perChr combination.
    if args.kmeans is not None:
        assert args.kmeans > 1
        num_clusters, method, how = args.kmeans, 'kmeans', args.howToCluster
    elif args.hclust is not None:
        assert args.hclust > 1
        log.info("Performing hierarchical clustering. "
                 "Please note that it might be very slow for large datasets.\n")
        num_clusters, method, how = args.hclust, 'hierarchical', args.howToCluster
    else:
        # make a 'fake' clustering to generalize the plotting of the submatrices
        num_clusters, method, how = 1, 'no_clust', 'full'

    clustered_info = cluster_matrices(agg_info,
                                      k=num_clusters, method=method, how=how,
                                      perChr=bool(args.perChr),
                                      max_deviation=args.max_deviation,
                                      keep_outlier=args.keep_outlier)

    plot_aggregated_contacts(clustered_info, num_clusters, M_half, args)

    # plot the diagonals
    # the diagonals plot is useful to see individual cases and if they had a contact in the center
    if args.diagnosticHeatmapFile:
        plot_diagnostic_heatmaps(clustered_info, M_half, args)
コード例 #22
0
def main(args=None):
    """Adjust every matrix of a multi-cooler file and write the kept ones.

    If only ``--createSubmatrix`` is given (no regions, no chromosomes) the
    first ``args.createSubmatrix`` matrices are copied unchanged and the
    program exits. Otherwise the matrices are distributed over
    ``args.threads`` worker processes running ``compute_adjust_matrix``;
    the results are collected, and every matrix whose keep flag is not 0 is
    saved to ``args.outFileName``.

    :param args: list of command line tokens handed to the argument parser;
                 ``None`` is passed through to ``parse_args`` unchanged.
    """
    args = parse_arguments().parse_args(args)
    matrices_name = args.matrix
    matrices_list = cooler.fileops.list_coolers(matrices_name)
    if args.createSubmatrix is not None and args.regions is None and args.chromosomes is None:
        for matrix in matrices_list[:args.createSubmatrix]:
            cooler.fileops.cp(args.matrix + '::' + matrix,
                              args.outFileName + '::' + matrix)
        exit(0)

    input_count_matrices = len(matrices_list)
    if input_count_matrices == 0:
        # Nothing to distribute; without this guard the per-thread split
        # below would divide by zero.
        log.error('No matrices found in {}'.format(matrices_name))
        exit(1)

    # Never start more workers than matrices, and always at least one.
    threads = max(1, min(args.threads, input_count_matrices))

    thread_done = [False] * threads
    hicmatrix_adjusted_objects_threads = [None] * threads
    keep_matrices_list_threads = [None] * threads

    matricesPerThread = input_count_matrices // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):
        start = i * matricesPerThread
        if i < threads - 1:
            matrices_name_list = matrices_list[start:start + matricesPerThread]
        else:
            # The last worker also takes the remainder of the division.
            matrices_name_list = matrices_list[start:]

        queue[i] = Queue()
        process[i] = Process(target=compute_adjust_matrix,
                             kwargs=dict(pMatrixName=matrices_name,
                                         pMatricesList=matrices_name_list,
                                         pArgs=args,
                                         pQueue=queue[i]))
        process[i].start()

    # Poll the result queues until every worker has delivered its result.
    while True:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                hicmatrix_adjusted_objects_threads[i], \
                    keep_matrices_list_threads[i] = queue[i].get()

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        if all(thread_done):
            break
        time.sleep(1)

    # Flatten the per-worker results; worker order equals input order.
    hicmatrix_adjusted_objects = [
        item for sublist in hicmatrix_adjusted_objects_threads
        for item in sublist
    ]
    keep_matrices_list = [
        item for sublist in keep_matrices_list_threads for item in sublist
    ]

    log.debug('length out {}'.format(len(hicmatrix_adjusted_objects)))
    first_write = True
    for i, hic_matrix in enumerate(hicmatrix_adjusted_objects):
        # '>=' keeps exactly the first N input matrices, matching the
        # '[:N]' slice of the copy-only path above.
        if args.createSubmatrix and i >= args.createSubmatrix:
            break

        if keep_matrices_list[i] == 0:
            continue

        # Only the first matrix that is actually written creates the file;
        # tying this to i == 0 would append when matrix 0 was dropped.
        matrixFileHandlerOutput = MatrixFileHandler(pFileType='cool',
                                                    pAppend=not first_write,
                                                    pEnforceInteger=False,
                                                    pFileWasH5=False,
                                                    pHic2CoolVersion=None)
        first_write = False

        matrixFileHandlerOutput.set_matrix_variables(
            hic_matrix.matrix, hic_matrix.cut_intervals, hic_matrix.nan_bins,
            hic_matrix.correction_factors, hic_matrix.distance_counts)
        matrixFileHandlerOutput.save(args.outFileName + '::' +
                                     matrices_list[i],
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    broken_count = input_count_matrices - np.sum(np.array(keep_matrices_list))
    print(
        'Out of {} matrices, {} were removed because they were broken.'.format(
            input_count_matrices, broken_count))
コード例 #23
0
def main(args=None):
    """Convert Hi-C matrices between file formats.

    Two paths are handled:

    * ``hic`` -> ``cool``: delegated to ``hic2cool_convert``. Without
      ``--resolutions`` resolution ``0`` is passed; otherwise one output
      file per resolution is written, with ``_<resolution>`` inserted
      before the file extension.
    * ``hicpro`` / ``homer`` / ``h5`` / ``cool`` input: every matrix is
      loaded with ``MatrixFileHandler`` and saved as ``cool``, ``h5``,
      ``homer``, ``ginteractions`` or ``mcool``.

    :param args: list of command line tokens handed to the argument parser;
                 ``None`` is passed through to ``parse_args`` unchanged.
    """
    import os  # local import: only needed to split the output file name

    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!'
            )
            sys.exit(1)
    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
                continue
            # Build the per-resolution file name inside the loop; the name
            # depends on the resolution that is being converted.
            root, extension = os.path.splitext(args.outFileName[i])
            for resolution in args.resolutions:
                out_name = root + '_' + str(resolution) + extension
                hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']:
        # Storing the applied correction means it must not be applied again
        # while saving.
        applyCorrection = not args.store_applied_correction
        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        # These options do not depend on the individual matrix.
        correction_operator = '/' if args.correction_division else None
        chromosomes_to_load = [args.chromosome] if args.chromosome else None

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
            else:
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer)

            # NOTE(review): other callers of load() in this code base unpack
            # distance_counts before correction_factors - confirm the order
            # against the MatrixFileHandler version in use.
            _matrix, cut_intervals, nan_bins, \
                correction_factors, distance_counts = matrixFileHandlerInput.load()

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat)

                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                matrixFileHandlerOutput.save(args.outFileName[i] + '.' +
                                             args.outputFormat,
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)
            elif args.outputFormat in ['mcool']:

                log.debug('outformat is mcool')
                # NOTE(review): this is only reported; processing continues
                # with every matrix - confirm whether it should abort.
                if args.resolutions and len(args.matrices) > 1:
                    log.error(
                        'Please define either one matrix and many resolutions which should be created.'
                    )

                hic_matrix = HiCMatrix.hiCMatrix()
                hic_matrix.setMatrix(_matrix, cut_intervals)
                bin_size = hic_matrix.getBinSize()

                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution'
                    )

                    for resolution in args.resolutions:
                        _mergeFactor = int(resolution) // bin_size
                        merged_matrix = hicMergeMatrixBins.merge_bins(
                            hic_matrix, _mergeFactor)
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer)
                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '.mcool' +
                            '::/resolutions/' + str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)

                else:
                    # No resolutions requested: store the matrix under its
                    # own bin size.
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool')
                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '.mcool' + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)