Ejemplo n.º 1
0
def compute_merge(pMatrixName, pMatrixList, pRunningWindow, pNumBins, pQueue):
    """Merge the bins of every listed matrix and report the result via pQueue.

    For each entry of pMatrixList the matrix addressed by
    ``pMatrixName + '::' + entry`` is loaded, merged with
    ``running_window_merge`` when pRunningWindow is truthy and with
    ``merge_bins`` otherwise (both receive pNumBins), and wrapped in a
    'cool' ``MatrixFileHandler``.

    On success the list of handlers is put on pQueue.  If any exception is
    raised along the way, a one-element list holding a ``"Fail: ..."``
    message is put on pQueue instead.  The function itself returns None.
    """
    handlers = []
    try:
        for entry in pMatrixList:
            source = hm.hiCMatrix(pMatrixName + '::' + entry)

            # Pick the merge strategy, then apply it with the same arguments.
            merge = running_window_merge if pRunningWindow else merge_bins
            merged = merge(source, pNumBins)

            handler = MatrixFileHandler(pFileType='cool',
                                        pMatrixFile=entry,
                                        pEnforceInteger=False,
                                        pFileWasH5=False)
            handler.set_matrix_variables(merged.matrix,
                                         merged.cut_intervals,
                                         merged.nan_bins,
                                         merged.correction_factors,
                                         merged.distance_counts)
            handlers.append(handler)

        pQueue.put(handlers)
    except Exception as error:
        # Report the failure through the queue rather than raising here.
        pQueue.put(["Fail: {}".format(str(error))])
Ejemplo n.º 2
0
def compute_merge(pMatrixName, pMatrixList, pRunningWindow, pNumBins, pQueue):
    """Merge the bins of every listed matrix and put the results on pQueue.

    For each entry of pMatrixList the matrix addressed by
    ``pMatrixName + '::' + entry`` is loaded and merged with
    ``running_window_merge`` when pRunningWindow is truthy, otherwise with
    ``merge_bins`` (both receive pNumBins).  The merged matrices are
    collected in input order and put on pQueue as one list.

    Exceptions are not handled here; they propagate to the caller and
    nothing is put on pQueue in that case.  Returns None.
    """
    results = []
    for entry in pMatrixList:
        loaded = hm.hiCMatrix(pMatrixName + '::' + entry)
        merge = running_window_merge if pRunningWindow else merge_bins
        results.append(merge(loaded, pNumBins))

    pQueue.put(results)
Ejemplo n.º 3
0
def main(args=None):
    """Convert Hi-C matrices from ``args.inputFormat`` to ``args.outputFormat``.

    ``args`` is passed unchanged to ``parse_arguments().parse_args``.

    Flow:
      * Input 'hic': only output 'cool' is accepted; every matrix is
        converted with ``hic2cool_convert`` (once per resolution when
        ``--resolutions`` is given) and the function returns.
      * Input 'hicpro', 'homer', 'h5', 'cool' or '2D-text': every matrix is
        loaded and then written according to the output format
        ('cool', 'h5', 'homer', 'ginteractions', 'hicpro' or 'mcool').

    Validation failures are logged and terminate the process via
    ``sys.exit(1)``.
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!: Input matrices {}; output matrices {}'
                .format(len(args.matrices), len(args.outFileName)))
            sys.exit(1)
    if args.inputFormat == 'hic' and args.outputFormat != 'cool':
        log.error('The export of a hic file is only possible to a cool file.')
        sys.exit(1)
    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:
                for resolution in args.resolutions:
                    # Insert '_<resolution>' in front of the file extension.
                    out_name = args.outFileName[i].split('.')
                    out_name[-2] = out_name[-2] + '_' + str(resolution)
                    out_name = '.'.join(out_name)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool', '2D-text']:
        format_was_h5 = args.inputFormat == 'h5'
        # When the applied correction is to be stored, it is not applied on save.
        applyCorrection = not args.store_applied_correction
        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        if args.inputFormat == '2D-text':
            if args.resolutions is None:
                log.error('The resolution must be defined via --resolutions')
                sys.exit(1)
            if args.chromosomeSizes is None:
                log.error(
                    'The sizes of the chromosomes must be defined via --chromosomeSizes.'
                )
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = matrixFileHandlerInput.load()
            elif args.inputFormat == '2D-text':
                # Read '<chromosome>\t<size>' lines; reading stops at the
                # first empty line or at end of file.
                chrom_sizes = OrderedDict()
                with open(args.chromosomeSizes.name, 'r') as file:
                    file_ = True
                    while file_:
                        file_ = file.readline().strip()
                        if file_ != '':
                            line_split = file_.split('\t')
                            chrom_sizes[line_split[0]] = int(line_split[1])
                chrom_sizes = list(chrom_sizes.items())

                args.resolutions = [int(x) for x in args.resolutions]

                # Tile every chromosome with bins of the first resolution;
                # the last bin of a chromosome is clipped to its size.
                cut_intervals = []
                for chromosome in chrom_sizes:
                    for interval in range(0, chromosome[1],
                                          args.resolutions[0]):
                        cut_intervals.append(
                            (chromosome[0], interval,
                             min(chromosome[1],
                                 interval + args.resolutions[0]), 1.0))

                hic_matrix_csr = lil_matrix(
                    (len(cut_intervals), len(cut_intervals)))
                log.debug('cut_intervals {}'.format(cut_intervals[:20]))

                hic_matrix = HiCMatrix.hiCMatrix()
                hic_matrix.setMatrix(hic_matrix_csr, cut_intervals)
                # Each text line holds two regions (chromosome, start, end)
                # followed by the interaction value, tab separated.
                with open(matrix, 'r') as file:
                    for j, line in enumerate(file):
                        line_split = line.split('\t')
                        chromosome_1 = str(line_split[0])
                        start_1 = int(line_split[1])
                        end_1 = int(line_split[2])

                        chromosome_2 = str(line_split[3])
                        start_2 = int(line_split[4])
                        end_2 = int(line_split[5])

                        value = float(line_split[6])
                        bin_id_1 = hic_matrix.getRegionBinRange(
                            chromosome_1, start_1, end_1)
                        bin_id_2 = hic_matrix.getRegionBinRange(
                            chromosome_2, start_2, end_2)
                        try:
                            hic_matrix.matrix[bin_id_1, bin_id_2] = value
                        except Exception as exp:
                            # Best effort: entries that cannot be assigned
                            # are only logged and skipped.
                            log.debug(str(exp))
                        if j % 1000 == 0:
                            log.debug('{} lines computed'.format(j))
                log.debug('csr with values filled!')
                hic_matrix.matrix = hic_matrix.matrix.tocsr()

                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = hic_matrix.matrix, hic_matrix.cut_intervals, hic_matrix.nan_bins, \
                    hic_matrix.distance_counts, hic_matrix.correction_factors

            else:
                correction_operator = '/' if args.correction_division else None
                chromosomes_to_load = [args.chromosome] if args.chromosome else None
                applyCorrectionCoolerLoad = not args.load_raw_values
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)

                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('cut_intervals {}'.format(cut_intervals[:20]))

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                log.debug('cool h5 homer ginteractions hicpro branch')

                if args.outputFormat in ['homer', 'ginteractions']:
                    log.debug('homer ginteractions branch')

                    # make it a upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)
                hic2CoolVersion = None
                cool_metadata = None
                if args.inputFormat == 'cool':
                    hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                    cool_metadata = matrixFileHandlerInput.matrixFile.hic_metadata

                log.debug('cool_metadata {}'.format(cool_metadata))
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5,
                    pHic2CoolVersion=hic2CoolVersion,
                    pHiCInfo=cool_metadata)

                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                log.debug('len(args.outFileName) {}, i {}'.format(
                    len(args.outFileName), i))
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)

            if args.outputFormat == 'hicpro':
                log.debug('hicpro branch')
                if len(args.matrices) == len(args.outFileName) and len(
                        args.outFileName) == len(args.bedFileHicpro):
                    log.debug('args.bedFileHicpro[i] {}'.format(
                        args.bedFileHicpro[i]))
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType=args.outputFormat,
                        pBedFileHicPro=args.bedFileHicpro[i])

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[i],
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
                else:
                    # The input list is args.matrices (as used everywhere
                    # else in this function), not args.matrix.
                    log.error(
                        'The number of input matrices, output files and bed files does not match: Input: {}; Output: {}; Bed: {}'
                        .format(len(args.matrices), len(args.outFileName),
                                len(args.bedFileHicpro)))
                    sys.exit(1)
            elif args.outputFormat in ['mcool']:

                log.debug('outformat is mcool')
                # NOTE(review): this combination is only logged; processing
                # continues afterwards - confirm that this is intended.
                if args.resolutions and len(args.matrices) > 1:
                    log.error(
                        'Please define one matrix and many resolutions which should be created or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)

                    bin_size = hic_matrix.getBinSize()
                    hic2CoolVersion = None
                    cool_metadata = None
                    if args.inputFormat == 'cool':
                        hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                        cool_metadata = matrixFileHandlerInput.matrixFile.hic_metadata
                    for j, resolution in enumerate(args.resolutions):
                        # Work on a copy so every resolution is derived from
                        # the matrix as it was loaded.
                        hic_matrix_res = deepcopy(hic_matrix)

                        _mergeFactor = int(resolution) // bin_size

                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            merged_matrix = hic_matrix_res
                        # The first resolution is written without pAppend,
                        # every following one with pAppend=True.
                        append = j > 0
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=append,
                            pFileWasH5=format_was_h5,
                            pHic2CoolVersion=hic2CoolVersion,
                            pHiCInfo=cool_metadata)

                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' +
                            str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)

                else:
                    # One input matrix per resolution: the first matrix is
                    # written without pAppend, later ones with pAppend=True.
                    append = i > 0
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        pAppend=append,
                        pFileWasH5=format_was_h5)

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
Ejemplo n.º 4
0
def main(args=None):
    """Convert Hi-C matrices from ``args.inputFormat`` to ``args.outputFormat``.

    ``args`` is passed unchanged to ``parse_arguments().parse_args``.

    Flow:
      * Input 'hic': only output 'cool' is accepted; every matrix is
        converted with ``hic2cool_convert`` (once per resolution when
        ``--resolutions`` is given) and the function returns.
      * Input 'hicpro', 'homer', 'h5' or 'cool': every matrix is loaded via
        ``MatrixFileHandler`` and written as 'cool', 'h5', 'homer',
        'ginteractions' or 'mcool'.

    Validation failures are logged and terminate the process via
    ``sys.exit(1)``.
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!'
            )
            sys.exit(1)
    # Report the unsupported combination instead of silently doing nothing.
    if args.inputFormat == 'hic' and args.outputFormat != 'cool':
        log.error('The export of a hic file is only possible to a cool file.')
        sys.exit(1)
    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:
                for resolution in args.resolutions:
                    # Insert '_<resolution>' in front of the file extension.
                    out_name = args.outFileName[i].split('.')
                    out_name[-2] = out_name[-2] + '_' + str(resolution)
                    out_name = '.'.join(out_name)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']:
        format_was_h5 = args.inputFormat == 'h5'
        # When the applied correction is to be stored, it is not applied on save.
        applyCorrection = not args.store_applied_correction
        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
            else:
                correction_operator = '/' if args.correction_division else None
                chromosomes_to_load = [args.chromosome] if args.chromosome else None
                applyCorrectionCoolerLoad = not args.load_raw_values
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)

            _matrix, cut_intervals, nan_bins, \
                distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                if args.outputFormat in ['homer', 'ginteractions']:
                    # make it a upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)
                hic2CoolVersion = None
                if args.inputFormat == 'cool':
                    hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5,
                    pHic2CoolVersion=hic2CoolVersion)

                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)
            elif args.outputFormat in ['mcool']:

                log.debug('outformat is mcool')
                # NOTE(review): this combination is only logged; processing
                # continues afterwards - confirm that this is intended.
                if args.resolutions and len(args.matrices) > 1:
                    log.error(
                        'Please define one matrix and many resolutions which should be created or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)

                    bin_size = hic_matrix.getBinSize()

                    for j, resolution in enumerate(args.resolutions):
                        # Work on a copy so every resolution is derived from
                        # the matrix as it was loaded.
                        hic_matrix_res = deepcopy(hic_matrix)

                        _mergeFactor = int(resolution) // bin_size

                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            merged_matrix = hic_matrix_res
                        # The first resolution is written without pAppend,
                        # every following one with pAppend=True.
                        append = j > 0
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=append,
                            pFileWasH5=format_was_h5)

                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' +
                            str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)

                else:
                    # One input matrix per resolution: the first matrix is
                    # written without pAppend, later ones with pAppend=True.
                    append = i > 0
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        pAppend=append,
                        pFileWasH5=format_was_h5)

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
Ejemplo n.º 5
0
def main(args=None):
    """Convert Hi-C matrices from ``args.inputFormat`` to ``args.outputFormat``.

    ``args`` is passed unchanged to ``parse_arguments().parse_args``.

    Flow:
      * Input 'hic': only output 'cool' is accepted; every matrix is
        converted with ``hic2cool_convert`` (once per resolution when
        ``--resolutions`` is given) and the function returns.
      * Input 'hicpro', 'homer', 'h5' or 'cool': every matrix is loaded via
        ``MatrixFileHandler`` and written as 'cool', 'h5', 'homer',
        'ginteractions' or 'mcool'.  The output format (or '.mcool') is
        appended to the given output file name.

    Validation failures are logged and terminate the process via
    ``sys.exit(1)``.
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!'
            )
            sys.exit(1)
    # Report the unsupported combination instead of silently doing nothing.
    if args.inputFormat == 'hic' and args.outputFormat != 'cool':
        log.error('The export of a hic file is only possible to a cool file.')
        sys.exit(1)
    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:
                # The output name depends on the resolution, so it has to be
                # built inside the loop: insert '_<resolution>' in front of
                # the file extension.
                for resolution in args.resolutions:
                    out_name = args.outFileName[i].split('.')
                    out_name[-2] = out_name[-2] + '_' + str(resolution)
                    out_name = '.'.join(out_name)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']:
        # When the applied correction is to be stored, it is not applied on save.
        applyCorrection = not args.store_applied_correction
        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
            else:
                correction_operator = '/' if args.correction_division else None
                chromosomes_to_load = [args.chromosome] if args.chromosome else None
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer)

            # NOTE(review): the last two names are unpacked as
            # (correction_factors, distance_counts); verify this order
            # against the return value of MatrixFileHandler.load().
            _matrix, cut_intervals, nan_bins, \
                correction_factors, distance_counts = matrixFileHandlerInput.load()

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat)

                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                matrixFileHandlerOutput.save(args.outFileName[i] + '.' +
                                             args.outputFormat,
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)
            elif args.outputFormat in ['mcool']:

                log.debug('outformat is mcool')
                # NOTE(review): this combination is only logged; processing
                # continues afterwards - confirm that this is intended.
                if args.resolutions and len(args.matrices) > 1:
                    log.error(
                        'Please define either one matrix and many resolutions which should be created.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()

                    for resolution in args.resolutions:
                        _mergeFactor = int(resolution) // bin_size
                        merged_matrix = hicMergeMatrixBins.merge_bins(
                            hic_matrix, _mergeFactor)
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer)
                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '.mcool' +
                            '::/resolutions/' + str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)

                else:
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool')
                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '.mcool' + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)