Example No. 1
def test_save_scool_pixeltables():
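    # Build an scool file from plain bins/pixels tables: the same pixel table
    # is stored three times under the cell names 'cell1'..'cell3', and the
    # resulting cell paths are verified with cooler.fileops.list_scool_cells.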
    outfile = NamedTemporaryFile(suffix='.scool',
                                 prefix='hicmatrix_scool_test')

    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'

    cooler_obj = cooler.Cooler(pMatrixFile)
    bins = cooler_obj.bins()[:]
    pixels = cooler_obj.pixels()[:]

    pixelsList = [pixels, pixels, pixels]
    matrices_list = ['cell1', 'cell2', 'cell3']
    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = None
    matrixFileHandler.matrixFile.bins = bins
    matrixFileHandler.matrixFile.pixel_list = pixelsList
    matrixFileHandler.matrixFile.name_list = matrices_list
    matrixFileHandler.save(outfile.name,
                           pSymmetric=True,
                           pApplyCorrection=False)

    content_of_scool = cooler.fileops.list_scool_cells(outfile.name)
    content_expected = ['/cells/cell1', '/cells/cell2', '/cells/cell3']
    for content in content_expected:
        assert content in content_of_scool
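    # Minimal sketch, not part of the original test: every stored cell can be
    # opened again with cooler by addressing its group path inside the scool file.
    cell1 = cooler.Cooler(outfile.name + '::/cells/cell1')
    assert cell1.info is not None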
Example No. 2
def test_save_cool():
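    # Round trip of a cool file: load it, write a copy and check that matrix,
    # cut intervals, nan bins, distance counts and correction factors survive
    # the save/load cycle unchanged.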
    cool_outfile = outfile + '.cool'

    # create matrixFileHandler instance with filetype 'cool'
    pMatrixFile = ROOT + 'Li_et_al_2015.cool'
    fh = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load(
    )
    # set matrix variables
    fh.set_matrix_variables(matrix, cut_intervals, nan_bins,
                            correction_factors, distance_counts)
    # and save it.
    fh.save(pName=cool_outfile, pSymmetric=True, pApplyCorrection=True)

    fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile)
    assert fh_test is not None
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = fh_test.load(
    )

    nt.assert_equal(matrix.data, matrix_test.data)
    nt.assert_equal(cut_intervals, cut_intervals_test)
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)
    nt.assert_equal(correction_factors, correction_factors_test)

    os.unlink(cool_outfile)
Example No. 3
def main(args=None):
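    # Load many single-cell cool files in parallel worker processes, reusing
    # the bin intervals of the first matrix, and bundle the returned
    # MatrixFileHandler objects into one scool file.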
    args = parse_arguments().parse_args(args)
    log.debug(args)
    matrix_file_handler_object_list = []
    matrixFileHandlerInput = MatrixFileHandler(pFileType='cool',
                                               pMatrixFile=args.matrices[0])

    _matrix, cut_intervals_all, nan_bins, \
        distance_counts, correction_factors = matrixFileHandlerInput.load()

    matrices_list = args.matrices

    threads = args.threads

    matrixFileHandler_list = [None] * args.threads
    process = [None] * args.threads
    queue = [None] * args.threads

    thread_done = [False] * args.threads
    matricesPerThread = len(matrices_list) // threads

    for i in range(args.threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) *
                                               matricesPerThread]
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=load_cool_files,
                             kwargs=dict(pMatricesList=matrices_name_list,
                                         pCutIntervals=cut_intervals_all,
                                         pQueue=queue[i]))
        process[i].start()

    all_data_collected = False
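    # Poll the worker queues; as soon as a worker has put its result on its
    # queue, collect it, join the process and mark the slot as done.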
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                matrixFileHandler_list[i] = queue[i].get()
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    matrix_file_handler_object_list = [
        item for sublist in matrixFileHandler_list for item in sublist
    ]

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = matrix_file_handler_object_list
    matrixFileHandler.save(args.outFileName,
                           pSymmetric=True,
                           pApplyCorrection=False)
Example No. 4
def test_save_cool_enforce_integer():
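    # Load an h5 matrix, save it as cool with pEnforceInteger=True, reload it
    # without applying the correction and check that the stored counts equal
    # the rounded input values.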
    cool_outfile = outfile + '.cool'

    # create matrixFileHandler instance with filetype 'h5'
    pMatrixFile = ROOT + 'Li_et_al_2015.h5'
    fh = MatrixFileHandler(pFileType='h5', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load(
    )

    # set matrix variables
    fh_new = MatrixFileHandler(pFileType='cool', pEnforceInteger=True)

    fh_new.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                correction_factors, distance_counts)
    fh_new.matrixFile.fileWasH5 = True
    # and save it.

    fh_new.save(pName=cool_outfile, pSymmetric=False, pApplyCorrection=True)

    fh_test = MatrixFileHandler(pFileType='cool',
                                pMatrixFile=cool_outfile,
                                pApplyCorrectionCoolerLoad=False)
    assert fh_test is not None
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = fh_test.load(
    )

    # pMatrixFile = ROOT + 'Li_et_al_2015.h5'
    # fh = MatrixFileHandler(pFileType='h5', pMatrixFile=pMatrixFile)
    # assert fh is not None

    # load data
    # matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()
    # instances, features = matrix.nonzero()
    # instances_factors = correction_factors[instances]
    # features_factors = correction_factors[features]
    # instances_factors *= features_factors

    # matrix_applied_correction = matrix.data / instances_factors
    # mask = matrix.data == 0
    matrix.data = np.rint(matrix.data)
    matrix.eliminate_zeros()
    # matrix_test.eliminate_zeros()

    nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=0)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)

    # os.unlink(cool_outfile)
    os.unlink(cool_outfile)
Example No. 5
def test_load_distance_cool():
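    # Load chromosome 1 with pDistance=2500000 so that only contacts within
    # 2.5 Mb of the diagonal are kept, round-trip the result through a cool
    # file and compare against an unrestricted load of the same chromosome.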
    cool_outfile = outfile + '.cool'

    # create matrixFileHandler instance with filetype 'cool'
    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'
    fh = MatrixFileHandler(pFileType='cool',
                           pMatrixFile=pMatrixFile,
                           pChrnameList=['1'],
                           pDistance=2500000)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load(
    )
    # set matrix variables
    fh.set_matrix_variables(matrix, cut_intervals, nan_bins,
                            correction_factors, distance_counts)
    # and save it.
    fh.save(pName=cool_outfile, pSymmetric=True, pApplyCorrection=True)

    fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile)
    assert fh_test is not None
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = fh_test.load(
    )

    # check distance load works as expected
    instances, features = matrix.nonzero()
    distances = np.absolute(instances - features)
    # log.debug('max: {}'.format(np.max(distances)))
    mask = distances > 1  # 2.5 Mb bins with pDistance=2500000 --> nothing beyond one bin from the diagonal
    assert np.sum(mask) == 0

    fh = MatrixFileHandler(pFileType='cool',
                           pChrnameList=['1'],
                           pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix2, _, _, _, _ = fh.load()
    instances, features = matrix2.nonzero()
    distances = np.absolute(instances - features)
    mask = distances > 1  # no pDistance limit --> contacts beyond 2.5 Mb are still present
    assert np.sum(mask) > 0

    # check if load and save matrix are equal
    nt.assert_equal(matrix.data, matrix_test.data)
    nt.assert_equal(cut_intervals, cut_intervals_test)
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)
    nt.assert_equal(correction_factors, correction_factors_test)

    os.unlink(cool_outfile)
Example No. 6
def test_save_scool_matrixHandlersCool():
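    # Build an scool file from three fully initialised cool MatrixFileHandler
    # objects that all share the same matrix data, then verify the cell paths.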

    outfile = NamedTemporaryFile(suffix='.scool',
                                 prefix='hicmatrix_scool_test')

    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'

    matrixFileHandlerInput = MatrixFileHandler(pFileType='cool',
                                               pMatrixFile=pMatrixFile)
    matrix, cut_intervals, nan_bins, \
        distance_counts, correction_factors = matrixFileHandlerInput.load()
    matrixFileHandlerOutput1 = MatrixFileHandler(pFileType='cool',
                                                 pMatrixFile='cell1',
                                                 pEnforceInteger=False,
                                                 pFileWasH5=False,
                                                 pHic2CoolVersion=None)
    matrixFileHandlerOutput1.set_matrix_variables(matrix, cut_intervals,
                                                  nan_bins, correction_factors,
                                                  distance_counts)

    matrixFileHandlerOutput2 = MatrixFileHandler(pFileType='cool',
                                                 pMatrixFile='cell2',
                                                 pEnforceInteger=False,
                                                 pFileWasH5=False,
                                                 pHic2CoolVersion=None)
    matrixFileHandlerOutput2.set_matrix_variables(matrix, cut_intervals,
                                                  nan_bins, correction_factors,
                                                  distance_counts)

    matrixFileHandlerOutput3 = MatrixFileHandler(pFileType='cool',
                                                 pMatrixFile='cell3',
                                                 pEnforceInteger=False,
                                                 pFileWasH5=False,
                                                 pHic2CoolVersion=None)
    matrixFileHandlerOutput3.set_matrix_variables(matrix, cut_intervals,
                                                  nan_bins, correction_factors,
                                                  distance_counts)

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = [
        matrixFileHandlerOutput1, matrixFileHandlerOutput2,
        matrixFileHandlerOutput3
    ]

    matrixFileHandler.save(outfile.name,
                           pSymmetric=True,
                           pApplyCorrection=False)

    content_of_scool = cooler.fileops.list_scool_cells(outfile.name)
    content_expected = ['/cells/cell1', '/cells/cell2', '/cells/cell3']
    for content in content_expected:
        assert content in content_of_scool
Example No. 7
def test_save_homer():
    homer_outfile = outfile + '.homer'

    # create matrixFileHandler instance with filetype 'homer'
    pMatrixFile = ROOT + 'test_matrix.homer'
    fh = MatrixFileHandler(pFileType='homer', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()
    # set matrix variables
    fh.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)  # noqa E501
    # and save it.
    fh.save(pName=homer_outfile, pSymmetric=False, pApplyCorrection=False)  # not implemented
    os.unlink(homer_outfile)
Example No. 8
def test_load_h5_save_cool():
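    # Convert an h5 matrix to cool with the correction applied on write, then
    # check that the reloaded values match the h5 counts divided by the
    # correction factors (the factors themselves are compared in inverted form).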
    cool_outfile = outfile + '.cool'

    # create matrixFileHandler instance with filetype 'h5'
    pMatrixFile = ROOT + 'Li_et_al_2015.h5'
    fh = MatrixFileHandler(pFileType='h5', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load(
    )

    # set matrix variables
    fh_new = MatrixFileHandler(pFileType='cool')

    fh_new.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                correction_factors, distance_counts)
    fh_new.matrixFile.fileWasH5 = True
    # and save it.

    fh_new.save(pName=cool_outfile, pSymmetric=False, pApplyCorrection=True)

    fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile)
    assert fh_test is not None
    matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = fh_test.load(
    )

    instances, features = matrix.nonzero()
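    # Combine the two per-bin correction factors into one factor per non-zero
    # entry; dividing the loaded h5 counts by it should reproduce the values
    # that save(pApplyCorrection=True) wrote into the cool file.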
    instances_factors = correction_factors[instances]
    features_factors = correction_factors[features]
    instances_factors *= features_factors

    matrix_applied_correction = matrix.data / instances_factors
    nt.assert_almost_equal(matrix_applied_correction,
                           matrix_test.data,
                           decimal=1)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)
    correction_factors = 1 / correction_factors
    mask = np.isnan(correction_factors)
    correction_factors[mask] = 0
    mask = np.isinf(correction_factors)
    correction_factors[mask] = 0
    nt.assert_equal(correction_factors, correction_factors_test)

    # os.unlink(cool_outfile)
    os.unlink(cool_outfile)
Example No. 9
def test_save_h5():
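    # Load an h5 matrix and write it back to h5; the test only checks that
    # saving completes without error.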
    h5_outfile = outfile + '.h5'

    # create matrixFileHandler instance with filetype 'h5'
    pMatrixFile = ROOT + 'Li_et_al_2015.h5'
    fh = MatrixFileHandler(pFileType='h5', pMatrixFile=pMatrixFile)
    assert fh is not None

    # load data
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load()
    # set matrix variables
    fh.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts)  # noqa E501
    # and save it.
    fh.save(h5_outfile, True, None)

    os.unlink(h5_outfile)
Example No. 10
def main(args=None):
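    # Conversion driver: 'hic' input is delegated to hic2cool, while
    # hicpro/homer/h5/cool inputs are loaded via MatrixFileHandler and written
    # as cool/h5/homer/ginteractions or as an mcool with one group per resolution.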
    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match the number of output matrices!'
            )
            exit(1)
    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:
                for resolution in args.resolutions:
                    out_name = args.outFileName[i].split('.')
                    out_name[-2] = out_name[-2] + '_' + str(resolution)
                    out_name = '.'.join(out_name)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']:
        applyCorrection = True
        if args.store_applied_correction:
            applyCorrection = False
        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
            else:
                correction_operator = None

                if args.correction_division:
                    correction_operator = '/'

                chromosomes_to_load = None
                if args.chromosome:
                    chromosomes_to_load = [args.chromosome]
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer)

            _matrix, cut_intervals, nan_bins, \
                distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat)

                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                matrixFileHandlerOutput.save(args.outFileName[i] + '.' +
                                             args.outputFormat,
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)
            elif args.outputFormat in ['mcool']:

                log.debug('outformat is mcool')
                if args.resolutions and len(args.matrices) > 1:
                    log.error(
                        'Please define either one matrix and many resolutions which should be created.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()

                    for resolution in args.resolutions:
                        _mergeFactor = int(resolution) // bin_size
                        merged_matrix = hicMergeMatrixBins.merge_bins(
                            hic_matrix, _mergeFactor)
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer)
                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '.mcool' +
                            '::/resolutions/' + str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)

                else:
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool')
                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '.mcool' + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
Example No. 11
def main(args=None):
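    # Merge bins of every matrix stored in the input cooler container in
    # parallel worker processes and write each merged matrix to the same group
    # path in the output file.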

    args = parse_arguments().parse_args(args)

    threads = args.threads
    merged_matrices = [None] * threads
    matrices_list = cooler.fileops.list_coolers(args.matrix)
    if len(matrices_list) < threads:
        threads = len(matrices_list)
    all_data_collected = False
    thread_done = [False] * threads
    length_index = [None] * threads
    length_index[0] = 0
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):

        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) *
                                               matricesPerThread]
            length_index[i + 1] = length_index[i] + len(matrices_name_list)
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_merge,
                             kwargs=dict(pMatrixName=args.matrix,
                                         pMatrixList=matrices_name_list,
                                         pRunningWindow=args.runningWindow,
                                         pNumBins=args.numBins,
                                         pQueue=queue[i]))

        process[i].start()

    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                log.debug('i {}'.format(i))
                log.debug('len(queue) {}'.format(len(queue)))
                log.debug('len(merged_matrices) {}'.format(
                    len(merged_matrices)))

                merged_matrices[i] = queue[i].get()

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    merged_matrices = [item for sublist in merged_matrices for item in sublist]

    for i, hic_matrix in enumerate(merged_matrices):
        append = False
        if i > 0:
            append = True
        matrixFileHandlerOutput = MatrixFileHandler(pFileType='cool',
                                                    pAppend=append,
                                                    pFileWasH5=False)

        matrixFileHandlerOutput.set_matrix_variables(
            hic_matrix.matrix, hic_matrix.cut_intervals, hic_matrix.nan_bins,
            hic_matrix.correction_factors, hic_matrix.distance_counts)
        matrixFileHandlerOutput.save(args.outFileName + '::' +
                                     matrices_list[i],
                                     pSymmetric=True,
                                     pApplyCorrection=False)
Example No. 12
def main(args=None):
    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match the number of output matrices! Input matrices: {}; output matrices: {}'
                .format(len(args.matrices), len(args.outFileName)))
            exit(1)
    if args.inputFormat == 'hic' and args.outputFormat != 'cool':
        log.error('The export of a hic file is only possible to a cool file.')
        exit(1)
    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:

                for resolution in args.resolutions:
                    out_name = args.outFileName[i].split('.')
                    out_name[-2] = out_name[-2] + '_' + str(resolution)
                    out_name = '.'.join(out_name)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool', '2D-text']:
        format_was_h5 = False
        if args.inputFormat == 'h5':
            format_was_h5 = True
        applyCorrection = True
        if args.store_applied_correction:
            applyCorrection = False
        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        if args.inputFormat == '2D-text':
            if args.resolutions is None:
                log.error('The resolution must be defined via --resolutions')
                sys.exit(1)
            if args.chromosomeSizes is None:
                log.error(
                    'The sizes of the chromosomes must be defined via --chromosomeSizes.'
                )
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = matrixFileHandlerInput.load()
            elif args.inputFormat == '2D-text':
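                # 2D-text input: chromosome sizes define fixed-size bins at the
                # first requested resolution, and every line of the 7-column
                # interaction file fills one entry of a lil_matrix that is
                # converted to CSR afterwards.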
                chrom_sizes = OrderedDict()
                size_genome = 0
                with open(args.chromosomeSizes.name, 'r') as file:
                    file_ = True
                    while file_:
                        file_ = file.readline().strip()
                        if file_ != '':
                            line_split = file_.split('\t')
                            chrom_sizes[line_split[0]] = int(line_split[1])
                            size_genome += int(line_split[1])
                chrom_sizes = list(chrom_sizes.items())

                # log.debug('chrom_sizes: {}'.format(chrom_sizes))
                args.resolutions = [int(x) for x in args.resolutions]
                # internal_matrix_size = size_genome // args.resolutions[0]

                cut_intervals = []
                for chromosome in chrom_sizes:
                    for interval in range(0, chromosome[1],
                                          args.resolutions[0]):
                        cut_intervals.append(
                            tuple([
                                chromosome[0], interval,
                                min(chromosome[1],
                                    interval + args.resolutions[0]), 1.0
                            ]))

                hic_matrix_csr = lil_matrix(
                    (len(cut_intervals), len(cut_intervals)))
                log.debug('cut_intervals {}'.format(cut_intervals[:20]))

                hic_matrix = HiCMatrix.hiCMatrix()
                hic_matrix.setMatrix(hic_matrix_csr, cut_intervals)
                # tmp_matrix = coo_matrix(())
                with open(matrix, 'r') as file:
                    for j, line in enumerate(file):
                        line_split = line.split('\t')
                        chromosome_1 = str(line_split[0])
                        start_1 = int(line_split[1])
                        end_1 = int(line_split[2])

                        chromosome_2 = str(line_split[3])
                        start_2 = int(line_split[4])
                        end_2 = int(line_split[5])

                        value = float(line_split[6])
                        bin_id_1 = hic_matrix.getRegionBinRange(
                            chromosome_1, start_1, end_1)
                        bin_id_2 = hic_matrix.getRegionBinRange(
                            chromosome_2, start_2, end_2)
                        try:
                            hic_matrix.matrix[bin_id_1, bin_id_2] = value
                        except Exception as exp:
                            log.debug(str(exp))
                        if j % 1000 == 0:
                            log.debug('{} lines computed'.format(j))
                log.debug('csr with values filled!')
                hic_matrix.matrix = hic_matrix.matrix.tocsr()

                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = hic_matrix.matrix, hic_matrix.cut_intervals, hic_matrix.nan_bins, \
                    hic_matrix.distance_counts, hic_matrix.correction_factors

            else:
                correction_operator = None

                if args.correction_division:
                    correction_operator = '/'

                chromosomes_to_load = None
                if args.chromosome:
                    chromosomes_to_load = [args.chromosome]
                applyCorrectionCoolerLoad = True
                if args.load_raw_values:
                    applyCorrectionCoolerLoad = False
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)

                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('cut_intervals {}'.format(cut_intervals[:20]))

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                log.debug('cool h5 homer ginteractions hicpro branch')

                if args.outputFormat in ['homer', 'ginteractions']:
                    log.debug('homer ginteractions branch')

                    # make it an upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)
                hic2CoolVersion = None
                cool_metadata = None
                if args.inputFormat == 'cool':
                    hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                    cool_metadata = matrixFileHandlerInput.matrixFile.hic_metadata

                log.debug('cool_metadata {}'.format(cool_metadata))
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5,
                    pHic2CoolVersion=hic2CoolVersion,
                    pHiCInfo=cool_metadata)

                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                log.debug('len(args.outFileName) {}, i {}'.format(
                    len(args.outFileName), i))
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)

            if args.outputFormat == 'hicpro':
                log.debug('hicpro branch')
                if len(args.matrices) == len(args.outFileName) and len(
                        args.outFileName) == len(args.bedFileHicpro):
                    log.debug('args.bedFileHicpro[i] {}'.format(
                        args.bedFileHicpro[i]))
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType=args.outputFormat,
                        pBedFileHicPro=args.bedFileHicpro[i])

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[i],
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
                else:
                    log.error(
                        'The number of input matrices, output files and bed files does not match: Input: {}; Output: {}; Bed: {}'
                        .format(len(args.matrices), len(args.outFileName),
                                len(args.bedFileHicpro)))
                    exit(1)
            elif args.outputFormat in ['mcool']:

                log.debug('outformat is mcool')
                if args.resolutions and len(args.matrices) > 1:
                    log.error(
                        'Please define one matrix and many resolutions which should be created or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)

                    bin_size = hic_matrix.getBinSize()
                    hic2CoolVersion = None
                    cool_metadata = None
                    if args.inputFormat == 'cool':
                        hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                        cool_metadata = matrixFileHandlerInput.matrixFile.hic_metadata
                    for j, resolution in enumerate(args.resolutions):
                        hic_matrix_res = deepcopy(hic_matrix)

                        _mergeFactor = int(resolution) // bin_size

                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            merged_matrix = hic_matrix_res
                        append = False
                        if j > 0:
                            append = True
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=append,
                            pFileWasH5=format_was_h5,
                            pHic2CoolVersion=hic2CoolVersion,
                            pHiCInfo=cool_metadata)

                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' +
                            str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)

                else:
                    append = False
                    if i > 0:
                        append = True
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        pAppend=append,
                        pFileWasH5=format_was_h5)

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
Example No. 13
def main(args=None):
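    # Correct every cell of an scool file in parallel; the workers reuse the
    # bin intervals of the first cell and return MatrixFileHandler objects,
    # which are written into a new scool file. A string result signals a
    # worker error and aborts the run.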

    args = parse_arguments().parse_args(args)

    threads = args.threads
    matrixFileHandler_list = [None] * threads
    matrices_list = cell_name_list(args.matrix)
    if len(matrices_list) < threads:
        threads = len(matrices_list)

    matrixFileHandlerInput = MatrixFileHandler(pFileType='cool', pMatrixFile=args.matrix + "::" + matrices_list[0])

    _matrix, cut_intervals_all, nan_bins, \
        distance_counts, correction_factors = matrixFileHandlerInput.load()

    all_data_collected = False
    thread_done = [False] * threads
    length_index = [None] * threads
    length_index[0] = 0
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    print('Threads: ' + str(threads))
    for i in range(threads):

        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
            length_index[i + 1] = length_index[i] + len(matrices_name_list)
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_correction, kwargs=dict(
            pMatrixName=args.matrix,
            pMatrixList=matrices_name_list,
            pCutIntervals=cut_intervals_all,
            pQueue=queue[i]
        )
        )

        process[i].start()

    fail_flag = False
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                matrixFileHandler_list[i] = queue[i].get()
                # csr_matrix_worker = queue[i].get()
                if isinstance(matrixFileHandler_list[i], str):
                    log.error('{}'.format(matrixFileHandler_list[i]))
                    fail_flag = True
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    if fail_flag:
        exit(1)
    matrix_file_handler_object_list = [item for sublist in matrixFileHandler_list for item in sublist]

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = matrix_file_handler_object_list
    matrixFileHandler.save(args.outFileName, pSymmetric=True, pApplyCorrection=False)
Example No. 14
def main(args=None):
    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match the number of output matrices!'
            )
            exit(1)
    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:

                for resolution in args.resolutions:
                    out_name = args.outFileName[i].split('.')
                    out_name[-2] = out_name[-2] + '_' + str(resolution)
                    out_name = '.'.join(out_name)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']:
        format_was_h5 = False
        if args.inputFormat == 'h5':
            format_was_h5 = True
        applyCorrection = True
        if args.store_applied_correction:
            applyCorrection = False
        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
            else:
                correction_operator = None

                if args.correction_division:
                    correction_operator = '/'

                chromosomes_to_load = None
                if args.chromosome:
                    chromosomes_to_load = [args.chromosome]
                applyCorrectionCoolerLoad = True
                if args.load_raw_values:
                    applyCorrectionCoolerLoad = False
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)

            _matrix, cut_intervals, nan_bins, \
                distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                if args.outputFormat in ['homer', 'ginteractions']:
                    # make it an upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)
                hic2CoolVersion = None
                if args.inputFormat == 'cool':
                    hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5,
                    pHic2CoolVersion=hic2CoolVersion)

                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)
            elif args.outputFormat in ['mcool']:

                log.debug('outformat is mcool')
                if args.resolutions and len(args.matrices) > 1:
                    log.error(
                        'Please define one matrix and many resolutions which should be created or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)

                    bin_size = hic_matrix.getBinSize()

                    for j, resolution in enumerate(args.resolutions):
                        hic_matrix_res = deepcopy(hic_matrix)

                        _mergeFactor = int(resolution) // bin_size

                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            merged_matrix = hic_matrix_res
                        append = False
                        if j > 0:
                            append = True
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=append,
                            pFileWasH5=format_was_h5)

                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' +
                            str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)

                else:
                    append = False
                    if i > 0:
                        append = True
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        pAppend=append,
                        pFileWasH5=format_was_h5)

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
Example No. 15
def main(args=None):
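    # Adjust every cell of an scool file in parallel (for example keep only
    # selected chromosomes or regions), skip cells flagged as broken and write
    # the remaining ones into a new scool file; with --createSubmatrix alone
    # the first cells are simply copied.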
    # args_string
    args = parse_arguments().parse_args(args)
    hicmatrix_adjusted_objects = []
    matrices_name = args.matrix
    threads = args.threads
    matrices_list = cooler.fileops.list_coolers(matrices_name)
    if args.createSubmatrix is not None and args.regions is None and args.chromosomes is None:
        for matrix in matrices_list[:args.createSubmatrix]:
            cooler.fileops.cp(args.matrix + '::' + matrix,
                              args.outFileName + '::' + matrix)
        exit(0)

    input_count_matrices = len(matrices_list)
    # log.debug('args.createSubmatrix {}, args.action {}, args.chromosomes {}'.format(args.createSubmatrix, args.action, args.chromosomes ))
    # exit()
    if threads > len(matrices_list):
        threads = len(matrices_list)

    all_data_collected = False
    thread_done = [False] * threads
    hicmatrix_adjusted_objects_threads = [None] * threads
    keep_matrices_list_threads = [None] * threads

    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):

        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) *
                                               matricesPerThread]
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_adjust_matrix,
                             kwargs=dict(pMatrixName=matrices_name,
                                         pMatricesList=matrices_name_list,
                                         pArgs=args,
                                         pQueue=queue[i]))

        process[i].start()
    log.debug("foo")
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                hicmatrix_adjusted_objects_threads[
                    i], keep_matrices_list_threads[i] = queue[i].get()

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    # TODO: implement this!
    hicmatrix_adjusted_objects = [
        item for sublist in hicmatrix_adjusted_objects_threads
        for item in sublist
    ]
    keep_matrices_list = [
        item for sublist in keep_matrices_list_threads for item in sublist
    ]

    log.debug('length out {}'.format(len(hicmatrix_adjusted_objects)))
    for i, hic_matrix in enumerate(hicmatrix_adjusted_objects):
        if args.createSubmatrix and i > args.createSubmatrix:
            break
        append = True
        if i == 0:
            append = False

        if keep_matrices_list[i] == 0:
            continue

        matrixFileHandlerOutput = MatrixFileHandler(pFileType='cool',
                                                    pAppend=append,
                                                    pEnforceInteger=False,
                                                    pFileWasH5=False,
                                                    pHic2CoolVersion=None)

        matrixFileHandlerOutput.set_matrix_variables(
            hic_matrix.matrix, hic_matrix.cut_intervals, hic_matrix.nan_bins,
            hic_matrix.correction_factors, hic_matrix.distance_counts)
        matrixFileHandlerOutput.save(args.outFileName + '::' +
                                     matrices_list[i],
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    broken_count = input_count_matrices - np.sum(np.array(keep_matrices_list))
    print(
        'Out of {} matrices, {} were removed because they were broken.'.format(
            input_count_matrices, broken_count))
Example No. 16
def main(args=None):
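    # Build one consensus matrix per cluster listed in the clusters file (the
    # cells of each cluster are combined by compute_consensus_matrix in worker
    # processes), optionally rescale them to the smallest total sum and store
    # them together in an scool file.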

    args = parse_arguments().parse_args(args)

    clusters = {}
    with open(args.clusters, 'r') as cluster_file:
        for i, line in enumerate(cluster_file.readlines()):
            line = line.strip()
            file_path, cluster = line.split(' ')
            if not file_path.startswith('/cells'):
                file_path = '/cells/' + file_path
            if int(cluster) in clusters:
                clusters[int(cluster)].append(file_path)
            else:
                clusters[int(cluster)] = [file_path]

    threads = args.threads
    if len(clusters) < threads:
        threads = len(clusters)

    thread_done = [False] * threads
    length_index = [None] * threads
    length_index[0] = 0
    queue = [None] * threads
    process = [None] * threads

    all_data_processed = False
    all_threads_done = False
    count = 0
    matrixFileHandlerObjects_list = []
    while not all_data_processed or not all_threads_done:
        for i in range(threads):

            if queue[i] is None and not all_data_processed:

                queue[i] = Queue()
                process[i] = Process(target=compute_consensus_matrix,
                                     kwargs=dict(
                                         pMatrixName=args.matrix,
                                         pClusterMatricesList=clusters[count],
                                         pClusterName=count,
                                         pQueue=queue[i]))
                process[i].start()
                thread_done[i] = False
                count += 1
                if count >= len(clusters):
                    all_data_processed = True

            elif queue[i] is not None and not queue[i].empty():
                log.debug('Get data!')
                matrixFileHandlerObjects_list.append(queue[i].get())
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True

                log.debug('all_data_processed {}'.format(all_data_processed))
                log.debug('all_threads_done {}'.format(all_threads_done))
                log.debug('queue {}'.format(queue))
                log.debug('process {}'.format(process))
                log.debug('thread_done {}'.format(thread_done))
                log.debug('count {}'.format(count))
            elif all_data_processed and queue[i] is None:
                thread_done[i] = True
            else:
                time.sleep(1)

        if all_data_processed:
            all_threads_done = True
            for thread in thread_done:
                if not thread:
                    all_threads_done = False

    sum_of_all = []
    for i, matrixFileHandler in enumerate(matrixFileHandlerObjects_list):
        sum_of_all.append(matrixFileHandler.matrixFile.matrix.sum())

    if args.no_normalization:
        argmin = np.argmin(sum_of_all)

        for i, matrixFileHandler in enumerate(matrixFileHandlerObjects_list):
            matrixFileHandler.matrixFile.matrix.data = matrixFileHandler.matrixFile.matrix.data.astype(
                np.float32)
            if i != argmin:
                mask = np.isnan(matrixFileHandler.matrixFile.matrix.data)
                matrixFileHandler.matrixFile.matrix.data[mask] = 0

                mask = np.isinf(matrixFileHandler.matrixFile.matrix.data)
                matrixFileHandler.matrixFile.matrix.data[mask] = 0
                adjust_factor = sum_of_all[i] / sum_of_all[argmin]
                matrixFileHandler.matrixFile.matrix.data /= adjust_factor
                mask = np.isnan(matrixFileHandler.matrixFile.matrix.data)

            mask = np.isnan(matrixFileHandler.matrixFile.matrix.data)
            matrixFileHandler.matrixFile.matrix.data[mask] = 0

            mask = np.isinf(matrixFileHandler.matrixFile.matrix.data)
            matrixFileHandler.matrixFile.matrix.data[mask] = 0
            matrixFileHandler.matrixFile.matrix.eliminate_zeros()

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = matrixFileHandlerObjects_list
    matrixFileHandler.save(args.outFileName,
                           pSymmetric=True,
                           pApplyCorrection=False)
Example No. 17
def main(args=None):
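    # Aggregate Hi-C contacts around the intervals of one or two BED files on a
    # z-score or obs/exp transformed matrix, optionally clustering the resulting
    # submatrices with k-means or hierarchical clustering.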
    args = parse_arguments().parse_args(args)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)
    ma.matrix.data[np.isnan(ma.matrix.data)] = 0
    ma.maskBins(ma.nan_bins)
    ma.matrix.data = ma.matrix.data
    new_intervals = hicexplorer.utilities.enlarge_bins(ma.cut_intervals)
    ma.setCutIntervals(new_intervals)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    default_range = '1000000:20000000'
    if args.range is None:
        if args.mode == "intra-chr":
            log.warning("You have not set any range. This is by default set to {} for intra-chr.".format(default_range))
        args.range = default_range
    min_dist, max_dist = args.range.split(":")
    log.info("checking range {}-{}".format(min_dist, max_dist))
    assert int(min_dist) < int(max_dist), "Error lower range is larger than upper range!"
    if args.transform == "z-score":  # use zscore matrix
        log.info("Computing z-score matrix. This may take a while.\n")
        if args.mode == 'intra-chr':
            ma.convert_to_zscore_matrix(maxdepth=int(max_dist) * 2.5, perchr=True)
        else:
            ma.convert_to_zscore_matrix(maxdepth=None, perchr=True)
    elif args.transform == "obs/exp":  # use obs/exp matrix
        log.info("Computing observed vs. expected matrix. This may take a while.\n")
        if args.mode == 'intra-chr':
            ma.convert_to_obs_exp_matrix(maxdepth=int(max_dist) * 2.5, perchr=True)
        else:
            ma.convert_to_obs_exp_matrix(maxdepth=None, perchr=True)
        if args.outFileObsExp:
            file_type = 'cool'
            if args.outFileObsExp.endswith('.h5'):
                file_type = 'h5'
            matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
            matrixFileHandlerOutput.set_matrix_variables(ma.matrix,
                                                         ma.cut_intervals,
                                                         ma.nan_bins,
                                                         ma.correction_factors,
                                                         ma.distance_counts)
            matrixFileHandlerOutput.save(args.outFileObsExp, pSymmetric=True, pApplyCorrection=False)

    M = args.numberOfBins if args.numberOfBins % 2 == 1 else args.numberOfBins + 1
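    # an odd window size keeps the anchor bin exactly in the center of the submatrix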
    M_half = int((M - 1) // 2)

    chrom_coord = dict()
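    # first and last genomic coordinate of every chromosome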
    chrom_list = ma.getChrNames()
    for chrom in chrom_list:
        first, last = ma.getChrBinRange(chrom)
        first = ma.getBinPos(first)
        last = ma.getBinPos(last - 1)
        chrom_coord[chrom] = (first[1], last[2])

    agg_info = dict()
    agg_info["chrom_coord"] = chrom_coord
    agg_info["seen"] = []
    agg_info["agg_matrix"] = OrderedDict()
    agg_info["agg_total"] = {}
    agg_info["agg_diagonals"] = OrderedDict()
    agg_info["agg_contact_position"] = {}
    agg_info["agg_center_values"] = {}
    agg_info["counter"] = 0
    agg_info["used_counter"] = 0
    agg_info["empty_mat"] = 0
    if (args.mode == 'inter-chr') and (len(agg_info["chrom_coord"]) == 1):
        exit("Error: 'inter-chr' mode can not be applied on matrices of only one chromosme.")
    if args.row_wise:
        # read bed files
        bed_intervals = args.BED.readlines()
        if args.BED2:
            bed_intervals2 = args.BED2.readlines()
        else:
            log.error("Error computing row-wise contacts requires two bed files!")
            exit("Error computing row-wise contacts requires two bed files!")
        # agg_matrix could be either per chromosome or genome wide
        aggregate_contacts_per_row(bed_intervals, bed_intervals2, agg_info, ma, chrom_list, M_half, args.largeRegionsOperation, args.range, args.transform, mode=args.mode, perChr=args.perChr)
    else:  # not row-wise
        # read and sort bed files.
        bed_intervals = read_bed_per_chrom(args.BED, chrom_list)
        if args.BED2:
            bed_intervals2 = read_bed_per_chrom(args.BED2, chrom_list)
        else:
            bed_intervals2 = bed_intervals
        # agg_matrix could be either per chromosome or genome wide
        aggregate_contacts(bed_intervals, bed_intervals2, agg_info, ma, M_half, args.largeRegionsOperation, args.range, args.transform, mode=args.mode, perChr=args.perChr)

    if args.kmeans is not None:
        cluster_ids = cluster_matrices(agg_info["agg_matrix"], args.kmeans, method='kmeans', how=args.howToCluster)
        num_clusters = args.kmeans
    elif args.hclust is not None:
        log.info("Performing hierarchical clustering."
                 "Please note that it might be very slow for large datasets.\n")
        cluster_ids = cluster_matrices(agg_info["agg_matrix"], args.hclust, method='hierarchical',
                                       how=args.howToCluster)
        num_clusters = args.hclust
    else:
        # make a 'fake' clustering to generalize the plotting of the submatrices
        cluster_ids = {}
        num_clusters = 1
        for k in agg_info["agg_matrix"].keys():
            cluster_ids[k] = [range(len(agg_info["agg_matrix"][k]))]
    if len(agg_info["agg_matrix"]) == 0:
        exit("No susbmatrix found to be aggregated.")
    plot_aggregated_contacts(agg_info["agg_matrix"], agg_info["agg_contact_position"], cluster_ids, num_clusters, M_half, args)

    if args.outFileContactPairs:
        for idx, chrom in enumerate(agg_info["agg_matrix"]):
            if chrom not in bed_intervals or chrom not in bed_intervals2:
                continue
            for cluster_number, cluster_indices in enumerate(cluster_ids[chrom]):
                center_values_to_order = np.array(agg_info["agg_center_values"][chrom])[cluster_indices]
                center_values_order = np.argsort(center_values_to_order)[::-1]

                output_name = "{file}_{chrom}_cluster_{id}.tab".format(file=args.outFileContactPairs,
                                                                       chrom=chrom, id=cluster_number + 1)
                with open(output_name, 'w') as fh:
                    for cl_idx in center_values_order:
                        value = center_values_to_order[cl_idx]
                        start, end, start2, end2 = agg_info["agg_contact_position"][chrom][cl_idx]
                        fh.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(chrom, start, end, chrom, start2, end2, value))

    # plot the diagonals
    # the diagonals plot is useful to see individual cases and if they had a contact in the center
    if args.diagnosticHeatmapFile:
        plot_diagnostic_heatmaps(agg_info["agg_diagonals"], cluster_ids, M_half, args)
Example No. 18
def main(args=None):

    args = parse_arguments().parse_args(args)

    threads = args.threads
    merged_matrices = [None] * threads
    matrices_list = cell_name_list(args.matrix)
    if len(matrices_list) < threads:
        threads = len(matrices_list)
    all_data_collected = False
    thread_done = [False] * threads
    length_index = [None] * threads
    length_index[0] = 0
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
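    # distribute the cells evenly over the worker processes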
    for i in range(threads):

        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) *
                                               matricesPerThread]
            length_index[i + 1] = length_index[i] + len(matrices_name_list)
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_merge,
                             kwargs=dict(pMatrixName=args.matrix,
                                         pMatrixList=matrices_name_list,
                                         pRunningWindow=args.runningWindow,
                                         pNumBins=args.numBins,
                                         pQueue=queue[i]))

        process[i].start()
    fail_flag = False
    fail_message = ''
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                # log.debug('i {}'.format(i))
                # log.debug('len(queue) {}'.format(len(queue)))
                # log.debug('len(merged_matrices) {}'.format(len(merged_matrices)))

                merged_matrices[i] = queue[i].get()
                if isinstance(
                        merged_matrices[i][0],
                        str) and merged_matrices[i][0].startswith('Fail: '):
                    fail_flag = True
                    fail_message = merged_matrices[i][0]
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    if fail_flag:
        log.error('{}'.format(fail_message))
        exit(1)
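    # flatten the per-thread results into one list of MatrixFileHandler objects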
    matrixFileHandlerObjects_list = [
        item for sublist in merged_matrices for item in sublist
    ]

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = matrixFileHandlerObjects_list
    matrixFileHandler.save(args.outFileName,
                           pSymmetric=True,
                           pApplyCorrection=False)
Example No. 19
def main(args=None):
    args = parse_arguments().parse_args(args)
    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        log.error(
            "Number of output file names and number of eigenvectors does not match. Please"
            "provide the name of each file.\nFiles: {}\nNumber of eigenvectors: {}"
            .format(args.outputFileName, args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []
    # PCA is computed per chromosome
    length_chromosome = 0
    chromosome_count = len(ma.getChrNames())
    if args.pearsonMatrix:
        trasf_matrix_pearson = lil_matrix(ma.matrix.shape)

    if args.obsexpMatrix:
        trasf_matrix_obsexp = lil_matrix(ma.matrix.shape)

    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
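        # restrict the analysis to the intra-chromosomal submatrix of the current chromosome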

        submatrix = ma.matrix[chr_range[0]:chr_range[1],
                              chr_range[0]:chr_range[1]]
        if args.norm:
            exp_obs_matrix_ = exp_obs_matrix_norm(submatrix, length_chromosome,
                                                  chromosome_count)
            exp_obs_matrix_ = convertNansToZeros(
                csr_matrix(exp_obs_matrix_)).todense()
            exp_obs_matrix_ = convertInfsToZeros(
                csr_matrix(exp_obs_matrix_)).todense()

        else:
            exp_obs_matrix_ = exp_obs_matrix_lieberman(submatrix,
                                                       length_chromosome,
                                                       chromosome_count)
            exp_obs_matrix_ = convertNansToZeros(
                csr_matrix(exp_obs_matrix_)).todense()
            exp_obs_matrix_ = convertInfsToZeros(
                csr_matrix(exp_obs_matrix_)).todense()

        if args.obsexpMatrix:
            trasf_matrix_obsexp[chr_range[0]:chr_range[1],
                                chr_range[0]:chr_range[1]] = lil_matrix(
                                    exp_obs_matrix_)

        pearson_correlation_matrix = np.corrcoef(exp_obs_matrix_)
        pearson_correlation_matrix = convertNansToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()
        pearson_correlation_matrix = convertInfsToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()

        if args.pearsonMatrix:
            trasf_matrix_pearson[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = lil_matrix(
                                     pearson_correlation_matrix)

        corrmatrix = np.cov(pearson_correlation_matrix)
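        # the leading eigenvectors of this covariance matrix give the A/B compartment signal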
        corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
        corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
        evals, eigs = linalg.eig(corrmatrix)
        k = args.numberOfEigenvectors

        chrom, start, end, _ = zip(
            *ma.cut_intervals[chr_range[0]:chr_range[1]])
        vecs_list += eigs[:, :k].tolist()

        chrom_list += chrom
        start_list += start
        end_list += end

    if args.pearsonMatrix:
        file_type = 'cool'
        if args.pearsonMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_pearson.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.pearsonMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.obsexpMatrix:
        file_type = 'cool'
        if args.obsexpMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_obsexp.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.obsexpMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.geneTrack:
        vecs_list = correlateEigenvectorWithGeneTrack(ma, vecs_list,
                                                      args.geneTrack)

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert (len(vecs_list) == len(chrom_list))

            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    if len(value) == args.numberOfEigenvectors:
                        if isinstance(value[idx], complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(
                            toString(chrom_list[i]), start_list[i],
                            end_list[i], value[idx]))

    elif args.format == 'bigwig':
        if pyBigWig.numpy != 1:
            log.error(
                "ERROR: Your version of pyBigWig does not support numpy: {}".
                format(pyBigWig.__file__))
            exit(1)
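        # build the bigwig header: one (chromosome, length) entry per chromosome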
        old_chrom = chrom_list[0]
        header = []
        for i, _chrom in enumerate(chrom_list):
            if old_chrom != _chrom:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = _chrom

        header.append((toString(chrom_list[-1]), end_list[-1]))
        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))

            assert (len(vecs_list) == len(chrom_list))
            _chrom_list = []
            _start_list = []
            _end_list = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            # set big wig header
            bw.addHeader(header)
            # create entry lists
            for i, value in enumerate(vecs_list):
                # some entries may have fewer components than the requested number of eigenvectors; skip them
                if len(value) == args.numberOfEigenvectors:
                    if isinstance(value[idx], complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    _chrom_list.append(toString(chrom_list[i]))
                    _start_list.append(start_list[i])
                    _end_list.append(end_list[i])

            # write entries
            bw.addEntries(_chrom_list,
                          _start_list,
                          ends=_end_list,
                          values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)
Example No. 20
def main(args=None):
    args = parse_arguments().parse_args(args)
    log.debug(args)
    matrix_file_handler_object_list = []

    # read genome sizes
    chromosome_sizes = {}
    genome_size = 0
    matrix_dimensions = 0
    with open(args.chromosomeSizes, 'r') as file:
        for i, line in enumerate(file.readlines()):
            line = line.strip()
            chromosome_name, chromosome_size = line.split('\t')
            chromosome_sizes[chromosome_name] = int(chromosome_size)
            genome_size += int(chromosome_size)
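            # one bin per full resolution step plus one bin for the remainder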
            matrix_dimensions += ((int(chromosome_size) // args.resolution) +
                                  1)

    log.debug('genome_size {}'.format(genome_size))
    log.debug('args.resolution {}'.format(args.resolution))

    log.debug('matrix_dimensions {}'.format(matrix_dimensions))

    log.debug('chromosome_sizes {}'.format(chromosome_sizes))
    # create cut_intervals:
    cut_intervals = []

    for chromosome, size in chromosome_sizes.items():
        for interval in range(0, size, args.resolution):
            cut_intervals.append(
                (chromosome, interval, min(size,
                                           interval + args.resolution), 1))

    matrices_list = args.matrices

    threads = args.threads

    matrixFileHandler_list = [None] * args.threads
    process = [None] * args.threads
    queue = [None] * args.threads

    thread_done = [False] * args.threads
    matricesPerThread = len(matrices_list) // threads

    for i in range(args.threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) *
                                               matricesPerThread]
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=txt_to_matrixFileHandler,
                             kwargs=dict(pMatricesList=matrices_name_list,
                                         pMatrixDimensions=matrix_dimensions,
                                         pCutIntervals=cut_intervals,
                                         pQueue=queue[i]))
        process[i].start()

    all_data_collected = False
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                matrixFileHandler_list[i] = queue[i].get()
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    matrix_file_handler_object_list = [
        item for sublist in matrixFileHandler_list for item in sublist
    ]

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = matrix_file_handler_object_list
    matrixFileHandler.save(args.outFileName,
                           pSymmetric=True,
                           pApplyCorrection=False)
Example No. 21
def main(args=None):
    args = parse_arguments().parse_args(args)
    matplotlib.rcParams['pdf.fonttype'] = 42

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)
    ma.matrix.data[np.isnan(ma.matrix.data)] = 0
    ma.maskBins(ma.nan_bins)
    new_intervals = hicexplorer.utilities.enlarge_bins(ma.cut_intervals)
    ma.setCutIntervals(new_intervals)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    default_range = '1000000:20000000'
    if args.range is not None:
        if (args.mode == "inter-chr") or (args.mode == "all"):
            log.info("--range is ineffective for inter-chr and all mode.")
    if args.range is None:
        if args.mode == "intra-chr":
            log.warning("You have not set any range. This is by default set to {} for intra-chr.".format(default_range))
        args.range = default_range
    min_dist, max_dist = args.range.split(":")
    if args.mode == "intra-chr":
        log.info("checking range {}-{}".format(min_dist, max_dist))
        assert int(min_dist) < int(max_dist), "Error: the lower bound of the range must be smaller than the upper bound!"
    if args.transform == "z-score":  # use zscore matrix
        log.info("Computing z-score matrix. This may take a while.\n")
        if args.mode == 'intra-chr':
            ma.convert_to_zscore_matrix(maxdepth=int(max_dist) * 2.5, perchr=True)
        else:
            ma.convert_to_zscore_matrix(maxdepth=None, perchr=True)
    elif args.transform == "obs/exp":  # use obs/exp matrix
        log.info("Computing observed vs. expected matrix. This may take a while.\n")
        if args.mode == 'intra-chr':
            ma.convert_to_obs_exp_matrix(maxdepth=int(max_dist) * 2.5, perchr=True)
        else:
            ma.convert_to_obs_exp_matrix(maxdepth=None, perchr=True)
        if args.outFileObsExp:
            file_type = 'cool'
            if args.outFileObsExp.endswith('.h5'):
                file_type = 'h5'
            matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
            matrixFileHandlerOutput.set_matrix_variables(ma.matrix,
                                                         ma.cut_intervals,
                                                         ma.nan_bins,
                                                         ma.correction_factors,
                                                         ma.distance_counts)
            matrixFileHandlerOutput.save(args.outFileObsExp, pSymmetric=True, pApplyCorrection=False)

    M = args.numberOfBins if args.numberOfBins % 2 == 1 else args.numberOfBins + 1
    M_half = int((M - 1) // 2)

    chrom_coord = dict()
    chrom_list = ma.getChrNames()
    for chrom in chrom_list:
        first, last = ma.getChrBinRange(chrom)
        first = ma.getBinPos(first)
        last = ma.getBinPos(last - 1)
        chrom_coord[chrom] = (first[1], last[2])

    agg_info = dict()
    agg_info["chrom_coord"] = chrom_coord  # coordinates of each chrom
    agg_info["seen"] = []  # seen bins
    agg_info["agg_matrix"] = {chrom: {} for chrom in chrom_list}  # important
    agg_info["agg_total"] = {chrom: {} for chrom in chrom_list}
    agg_info["agg_diagonals"] = {chrom: {} for chrom in chrom_list}
    agg_info["agg_contact_position"] = {chrom: {} for chrom in chrom_list}  # important
    agg_info["agg_center_values"] = {chrom: {} for chrom in chrom_list}  # important
    agg_info["counter"] = 0
    agg_info["used_counter"] = 0
    agg_info["empty_mat"] = 0

    log.debug('agg_info["agg_matrix"] {}'.format(agg_info["agg_matrix"]))
    if (args.mode == 'inter-chr') and (len(agg_info["chrom_coord"]) == 1):
        exit("Error: 'inter-chr' mode can not be applied on matrices of only one chromosme.")
    if (args.mode == 'inter-chr') and (args.perChr):
        exit("Error: 'inter-chr' mode can not be used along with --perChr.")
    if (args.mode == 'all') and (args.perChr):
        exit("Error: 'all' mode can not be used along with --perChr.")
    if args.row_wise:
        # read bed files
        bed_intervals = args.BED.readlines()
        if args.BED2:
            bed_intervals2 = args.BED2.readlines()
        else:
            log.error("Error computing row-wise contacts requires two bed files!")
            exit("Error computing row-wise contacts requires two bed files!")
        if len(bed_intervals) != len(bed_intervals2):
            log.error("row_wise only works if both bed files have the same length.")
            exit("Error row_wise only works if both bed files have the same length.")
        # agg_matrix could be either per chromosome or genome wide
        aggregate_contacts_per_row(bed_intervals, bed_intervals2, agg_info, ma, chrom_list,
                                   M_half, args.largeRegionsOperation, args.range,
                                   args.transform, mode=args.mode, perChr=args.perChr, pConsiderStrandDirection=args.considerStrandDirection)
    else:  # not row-wise
        # read and sort bed files.
        bed_intervals = read_bed_per_chrom(args.BED, chrom_list, args.considerStrandDirection)
        if args.BED2:
            bed_intervals2 = read_bed_per_chrom(args.BED2, chrom_list, args.considerStrandDirection)
        else:
            bed_intervals2 = bed_intervals
        # agg_matrix could be either per chromosome or genome wide
        aggregate_contacts(bed_intervals, bed_intervals2, agg_info, ma, M_half,
                           args.largeRegionsOperation, args.range, args.transform,
                           mode=args.mode, pConsiderStrandDirection=args.considerStrandDirection)
    if len(agg_info["agg_matrix"]) == 0:
        exit("No susbmatrix found to be aggregated.")

    if args.kmeans is not None:
        assert args.kmeans > 1
        if args.perChr:
            clustered_info = cluster_matrices(agg_info,
                                              k=args.kmeans, method='kmeans', how=args.howToCluster,
                                              perChr=args.perChr, max_deviation=args.max_deviation,
                                              keep_outlier=args.keep_outlier)
        else:
            clustered_info = cluster_matrices(agg_info,
                                              k=args.kmeans, method='kmeans', how=args.howToCluster,
                                              perChr=False, max_deviation=args.max_deviation,
                                              keep_outlier=args.keep_outlier)
        num_clusters = args.kmeans
    elif args.hclust is not None:
        assert args.hclust > 1
        log.info("Performing hierarchical clustering. "
                 "Please note that it might be very slow for large datasets.\n")
        if args.perChr:
            clustered_info = cluster_matrices(agg_info,
                                              k=args.hclust, method='hierarchical',
                                              how=args.howToCluster,
                                              perChr=args.perChr, max_deviation=args.max_deviation,
                                              keep_outlier=args.keep_outlier)
        else:
            clustered_info = cluster_matrices(agg_info,
                                              k=args.hclust, method='hierarchical',
                                              how=args.howToCluster,
                                              perChr=False, max_deviation=args.max_deviation,
                                              keep_outlier=args.keep_outlier)
        num_clusters = args.hclust
    else:
        # make a 'fake' clustering to generalize the plotting of the submatrices
        k = 1
        if args.perChr:
            clustered_info = cluster_matrices(agg_info, k=k, method='no_clust',
                                              how='full', perChr=args.perChr, max_deviation=args.max_deviation,
                                              keep_outlier=args.keep_outlier)

        else:
            clustered_info = cluster_matrices(agg_info, k=k, method='no_clust',
                                              how='full', perChr=False, max_deviation=args.max_deviation,
                                              keep_outlier=args.keep_outlier)
        num_clusters = k

    plot_aggregated_contacts(clustered_info, num_clusters, M_half, args)

    # plot the diagonals
    # the diagonals plot is useful to see individual cases and if they had a contact in the center
    if args.diagnosticHeatmapFile:
        plot_diagnostic_heatmaps(clustered_info, M_half, args)
Example No. 22
def main(args=None):
    args = parse_arguments().parse_args(args)
    log.debug(args)
    matrix_file_handler_object_list = []

    matrices_list = cell_name_list(args.matrix)
    if args.action in ['extractToCool', 'extractScool']:
        if args.cellList is not None:
            matrix_list_tmp = []
            with open(args.cellList, 'r') as file:
                for line in file:
                    values = line.strip()
                    log.debug('values {}'.format(values))
                    if not values.startswith('/cells'):
                        values = '/cells/' + values
                    if values in matrices_list:
                        matrix_list_tmp.append(values)

            matrices_list = matrix_list_tmp

    if len(matrices_list) == 0:
        raise OSError('No cells for processing. Terminating.')
    if len(matrices_list) < args.threads:
        args.threads = len(matrices_list)

    matrixFileHandlerInput = MatrixFileHandler(pFileType='cool',
                                               pMatrixFile=args.matrix + "::" +
                                               matrices_list[0])
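    # load the first cell once; its bin intervals are shared by all cells of the scool file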

    _matrix, cut_intervals_all, nan_bins, \
        distance_counts, correction_factors = matrixFileHandlerInput.load()

    threads = args.threads

    matrixFileHandler_list = [None] * args.threads
    process = [None] * args.threads
    queue = [None] * args.threads

    thread_done = [False] * args.threads
    matricesPerThread = len(matrices_list) // threads

    for i in range(args.threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) *
                                               matricesPerThread]
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=load_cool_files,
                             kwargs=dict(pMatrixName=args.matrix,
                                         pMatricesList=matrices_name_list,
                                         pCutIntervals=cut_intervals_all,
                                         pQueue=queue[i]))
        process[i].start()

    all_data_collected = False
    fail_flag = False
    fail_message = ''
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                matrixFileHandler_list[i] = queue[i].get()
                if 'Fail:' in matrixFileHandler_list[i]:
                    fail_flag = True
                    fail_message = matrixFileHandler_list[i][6:]
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    if fail_flag:
        log.error(fail_message)
        exit(1)
    matrix_file_handler_object_list = [
        item for sublist in matrixFileHandler_list for item in sublist
    ]

    if args.action in ['extractScool', 'update']:
        matrixFileHandler = MatrixFileHandler(pFileType='scool')
        matrixFileHandler.matrixFile.coolObjectsList = matrix_file_handler_object_list
        matrixFileHandler.save(args.outFileName,
                               pSymmetric=True,
                               pApplyCorrection=False)
    else:
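        # extractToCool: write every selected cell as an individual .cool file into the output folder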
        if not os.path.exists(args.outFileName):
            try:
                os.makedirs(args.outFileName)
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        for matrixFileHandler in matrix_file_handler_object_list:
            matrixFileHandler.save(
                args.outFileName + '/' +
                matrixFileHandler.matrixFile.matrixFileName + '.cool',
                pApplyCorrection=True,
                pSymmetric=True)
Example No. 23
def main(args=None):
    args = parse_arguments().parse_args(args)
    matrices_name = args.matrix
    threads = args.threads
    matrices_list = cell_name_list(matrices_name)
    if args.createSubmatrix is not None and args.regions is None and args.chromosomes is None:
        for matrix in matrices_list[:args.createSubmatrix]:
            cooler.fileops.cp(args.matrix + '::' + matrix,
                              args.outFileName + '::' + matrix)
        exit(0)

    input_count_matrices = len(matrices_list)
    if threads > len(matrices_list):
        threads = len(matrices_list)

    # load bin ids only once
    cooler_obj_external = cooler.Cooler(matrices_name + '::' +
                                        matrices_list[0])
    bins = cooler_obj_external.bins()[:]

    # Use whichever id list is shorter for the membership test.
    # The per-pixel lookup
    #   indices = pixels['bin1_id'].apply(lambda x: x in pListIds)
    # is faster when pListIds is short, so if the inverted id list is shorter
    # the keep/drop logic is inverted as well.
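    # For example, when 22 of 23 chromosomes are kept, it is cheaper to test
    # each pixel against the bin ids of the single dropped chromosome and to
    # invert the result.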
    apply_inverted = False
    if args.action == 'keep':
        list_ids = bins.index[bins['chrom'].apply(
            lambda x: x in args.chromosomes)].tolist()
        list_inverted_logic_ids = bins.index[bins['chrom'].apply(
            lambda x: x not in args.chromosomes)].tolist()

        bins_new = bins[bins['chrom'].apply(
            lambda x: x in args.chromosomes)].reset_index()

    else:
        list_ids = bins.index[bins['chrom'].apply(
            lambda x: x not in args.chromosomes)].tolist()
        list_inverted_logic_ids = bins.index[bins['chrom'].apply(
            lambda x: x in args.chromosomes)].tolist()
        bins_new = bins[bins['chrom'].apply(
            lambda x: x not in args.chromosomes)].reset_index()

    if len(list_inverted_logic_ids) < len(list_ids):
        apply_inverted = True
        list_ids = list_inverted_logic_ids

    dict_values = bins_new['index'].to_dict()
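    # inv_map sends each original bin id to its new, contiguous position after the drop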
    inv_map = {}
    for k, v in dict_values.items():
        if k == v:
            continue
        inv_map[v] = k
    bins_new.drop(['index'], axis=1, inplace=True)

    all_data_collected = False
    thread_done = [False] * threads
    pixels_thread = [None] * threads
    keep_thread = [None] * threads

    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):

        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) *
                                               matricesPerThread]
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_adjust_matrix,
                             kwargs=dict(pMatrixName=matrices_name,
                                         pMatricesList=matrices_name_list,
                                         pArgs=args,
                                         pListIds=list_ids,
                                         pInvertedMap=inv_map,
                                         pInvertedLogic=apply_inverted,
                                         pQueue=queue[i]))

        process[i].start()
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                pixels_thread[i], keep_thread[i] = queue[i].get()

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    pixels_list = [item for sublist in pixels_thread for item in sublist]
    keep_list = [item for sublist in keep_thread for item in sublist]

    matrices_list = np.array(matrices_list)
    mask = np.array(keep_list)
    matrices_list = matrices_list[mask]

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
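    # write the remaining (non-broken) cells into a new scool file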
    matrixFileHandler.matrixFile.bins = bins_new
    matrixFileHandler.matrixFile.pixel_list = pixels_list
    matrixFileHandler.matrixFile.name_list = matrices_list

    matrixFileHandler.save(args.outFileName,
                           pSymmetric=True,
                           pApplyCorrection=False)
    broken_count = input_count_matrices - np.sum(np.array(keep_list))
    print(
        'Out of {} matrices, {} were removed because they were broken.'.format(
            input_count_matrices, broken_count))