def computeDifferentialTADs(pMatrixTarget, pMatrixControl, pDomainList,
                            pCoolOrH5, pPValue, pThreadId, pQueue):
    accepted_inter_left = []
    accepted_inter_right = []
    accepted_intra = []
    p_values_list = []
    rows = []

    for i, row in enumerate(pDomainList):

        if pThreadId is None:
            log.debug('first thread')
            if i == len(pDomainList) - 1:
                continue
        elif pThreadId is True:
            log.debug('middle thread')

            if i == 0 or i == len(pDomainList) - 1:
                continue
        elif pThreadId is False:
            log.debug('last thread')

            if i == 0:
                continue

        if i - 1 >= 0:
            chromosom = pDomainList[i - 1][0]
            start = pDomainList[i - 1][1]
        else:
            chromosom = pDomainList[i][0]
            start = pDomainList[i][1]
        if i + 1 < len(pDomainList):
            end = pDomainList[i + 1][2]
        else:
            end = pDomainList[i][2]
        # midpos = row[1] + ((row[2] - row[1]) / 2)

        if pCoolOrH5:

            # get intra-TAD data
            hic_matrix_target = hm.hiCMatrix(
                pMatrixFile=pMatrixTarget,
                pChrnameList=[
                    str(row[0]) + ':' + str(row[1]) + '-' + str(row[2])
                ])
            hic_matrix_control = hm.hiCMatrix(
                pMatrixFile=pMatrixControl,
                pChrnameList=[
                    str(row[0]) + ':' + str(row[1]) + '-' + str(row[2])
                ])
            matrix_target = hic_matrix_target.matrix.toarray()
            matrix_control = hic_matrix_control.matrix.toarray()

            hic_matrix_target_inter_tad = hm.hiCMatrix(
                pMatrixFile=pMatrixTarget,
                pChrnameList=[
                    str(chromosom) + ':' + str(start) + '-' + str(end)
                ])
            hic_matrix_control_inter_tad = hm.hiCMatrix(
                pMatrixFile=pMatrixControl,
                pChrnameList=[
                    str(chromosom) + ':' + str(start) + '-' + str(end)
                ])

            matrix_target_inter_tad = hic_matrix_target_inter_tad.matrix
            matrix_control_inter_tad = hic_matrix_control_inter_tad.matrix

        else:
            # in the h5 case, pMatrixTarget and pMatrixControl are already HiCMatrix objects
            hic_matrix_target = pMatrixTarget
            hic_matrix_control = pMatrixControl
            hic_matrix_target_inter_tad = pMatrixTarget
            hic_matrix_control_inter_tad = pMatrixControl
            indices_target = hic_matrix_target.getRegionBinRange(
                str(row[0]), row[1], row[2])
            indices_control = hic_matrix_control.getRegionBinRange(
                str(row[0]), row[1], row[2])

            matrix_target = hic_matrix_target.matrix[
                indices_target[0]:indices_target[1],
                indices_target[0]:indices_target[1]].toarray()
            matrix_control = hic_matrix_control.matrix[
                indices_control[0]:indices_control[1],
                indices_control[0]:indices_control[1]].toarray()
            matrix_target_inter_tad = pMatrixTarget.matrix
            matrix_control_inter_tad = pMatrixControl.matrix

        matrix_target = matrix_target.flatten()
        matrix_control = matrix_control.flatten()
        # tad_midpoint = hic_matrix_target_inter_tad.getRegionBinRange(str(row[0]), midpos, midpos)[0]

        # index of the boundary between the left neighboring TAD and the current TAD
        left_boundary_index_target = hic_matrix_target_inter_tad.getRegionBinRange(
            str(chromosom), row[1], row[1])[0]
        left_boundary_index_control = hic_matrix_control_inter_tad.getRegionBinRange(
            str(chromosom), row[1], row[1])[0]
        if pCoolOrH5:
            outer_left_boundary_index_target = 0
            outer_left_boundary_index_control = 0

            outer_right_boundary_index_control = -1
            outer_right_boundary_index_target = -1

        else:
            outer_left_boundary_index_target = hic_matrix_target_inter_tad.getRegionBinRange(
                str(chromosom), start, end)[0]
            outer_left_boundary_index_control = hic_matrix_control_inter_tad.getRegionBinRange(
                str(chromosom), start, end)[0]

            outer_right_boundary_index_control = hic_matrix_control_inter_tad.getRegionBinRange(
                str(chromosom), start, end)[1]
            outer_right_boundary_index_target = hic_matrix_target_inter_tad.getRegionBinRange(
                str(chromosom), start, end)[1]

        if i + 1 < len(pDomainList) and not pCoolOrH5:
            # index of the boundary between the current TAD and its right neighbor
            right_boundary_index_target = hic_matrix_target_inter_tad.getRegionBinRange(
                str(chromosom), row[2], row[2])[0]
            right_boundary_index_control = hic_matrix_control_inter_tad.getRegionBinRange(
                str(chromosom), row[2], row[2])[0]
        elif i + 1 < len(pDomainList) - 1:
            right_boundary_index_target = hic_matrix_target_inter_tad.getRegionBinRange(
                str(chromosom), row[2], row[2])[0]
            right_boundary_index_control = hic_matrix_control_inter_tad.getRegionBinRange(
                str(chromosom), row[2], row[2])[0]

        if i - 1 >= 0 and i + 1 < len(pDomainList):
            intertad_left_target = matrix_target_inter_tad[
                outer_left_boundary_index_target:left_boundary_index_target,
                left_boundary_index_target:
                right_boundary_index_target].toarray()
            intertad_right_target = matrix_target_inter_tad[
                left_boundary_index_target:right_boundary_index_target,
                right_boundary_index_target:
                outer_right_boundary_index_target].toarray()
            intertad_left_control = matrix_control_inter_tad[
                outer_left_boundary_index_control:left_boundary_index_control,
                left_boundary_index_control:
                right_boundary_index_control].toarray()
            intertad_right_control = matrix_control_inter_tad[
                left_boundary_index_control:right_boundary_index_control,
                right_boundary_index_control:
                outer_right_boundary_index_control].toarray()

        elif i - 1 < 0 and i + 1 < len(pDomainList):
            intertad_right_target = matrix_target_inter_tad[
                left_boundary_index_target:right_boundary_index_target,
                right_boundary_index_target:
                outer_right_boundary_index_target].toarray()
            intertad_right_control = matrix_control_inter_tad[
                left_boundary_index_control:right_boundary_index_control,
                right_boundary_index_control:
                outer_right_boundary_index_control].toarray()

        elif i - 1 > 0 and i + 1 >= len(pDomainList):
            intertad_left_target = matrix_target_inter_tad[
                outer_left_boundary_index_target:left_boundary_index_target,
                left_boundary_index_target:
                right_boundary_index_target].toarray()
            intertad_left_control = matrix_control_inter_tad[
                outer_left_boundary_index_control:left_boundary_index_control,
                left_boundary_index_control:
                right_boundary_index_control].toarray()

        significance_level_left = None
        significance_level_right = None
        statistic_left = None
        statistic_right = None

        if i - 1 >= 0 and i + 1 < len(pDomainList):
            intertad_left_target = intertad_left_target.flatten()
            intertad_left_control = intertad_left_control.flatten()
            intertad_right_target = intertad_right_target.flatten()
            intertad_right_control = intertad_right_control.flatten()

            statistic_left, significance_level_left = ranksums(
                intertad_left_target, intertad_left_control)
            statistic_right, significance_level_right = ranksums(
                intertad_right_target, intertad_right_control)
        elif i - 1 < 0 and i + 1 < len(pDomainList):
            intertad_right_target = intertad_right_target.flatten()
            intertad_right_control = intertad_right_control.flatten()
            statistic_right, significance_level_right = ranksums(
                intertad_right_target, intertad_right_control)
        elif i - 1 > 0 and i + 1 >= len(pDomainList):
            intertad_left_target = intertad_left_target.flatten()
            intertad_left_control = intertad_left_control.flatten()
            log.debug('intertad_left_target {}'.format(intertad_left_target))
            log.debug('intertad_left_control {}'.format(intertad_left_control))

            statistic_left, significance_level_left = ranksums(
                intertad_left_target, intertad_left_control)

        # log.debug('matrix_target {}'.format(matrix_target))
        # log.debug('matrix_control {}'.format(matrix_control))

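        # Wilcoxon rank-sum test (scipy.stats.ranksums): are the intra-TAD contacts
        # of target and control drawn from the same distribution?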
        statistic, significance_level = ranksums(matrix_target, matrix_control)
        log.debug('statistic {}, significance_level {}'.format(
            statistic, significance_level))
        log.debug('right statistic {}, significance_level {}'.format(
            statistic_right, significance_level_right))
        log.debug('left statistic {}, significance_level {}'.format(
            statistic_left, significance_level_left))

        p_values = []
        if significance_level_left is None or np.isnan(
                significance_level_left):
            accepted_inter_left.append(0)
            p_values.append(np.nan)
        elif significance_level_left <= pPValue:
            accepted_inter_left.append(1)
            p_values.append(significance_level_left)
        else:
            accepted_inter_left.append(0)
            p_values.append(significance_level_left)

        if significance_level_right is None or np.isnan(
                significance_level_right):
            accepted_inter_right.append(0)
            p_values.append(np.nan)
        elif significance_level_right <= pPValue:
            accepted_inter_right.append(1)
            p_values.append(significance_level_right)
        else:
            accepted_inter_right.append(0)
            p_values.append(significance_level_right)

        if significance_level is None or np.isnan(significance_level):
            accepted_intra.append(0)
            p_values.append(np.nan)
        elif significance_level <= pPValue:
            accepted_intra.append(1)
            p_values.append(significance_level)
        else:
            accepted_intra.append(0)
            p_values.append(significance_level)

        p_values_list.append(p_values)

        rows.append(row)
    # hic_matrix_target_inter_tad.save('manipulated_target.cool')
    # hic_matrix_control_inter_tad.save('manipulated_control.cool')
    pQueue.put([
        p_values_list, accepted_inter_left, accepted_inter_right,
        accepted_intra, rows
    ])
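
# A minimal driver sketch for the worker above, assuming two hypothetical matrix
# files ('target.cool', 'control.cool') and a tiny made-up domain list; judging by
# the pThreadId handling, the surrounding tool splits the domain list over several
# such processes. This is an illustration, not the tool's actual driver code.
from multiprocessing import Process, Queue


def run_differential_tads_sketch():
    domains = [('chr1', 0, 200000), ('chr1', 200000, 500000),
               ('chr1', 500000, 800000)]
    queue = Queue()
    worker = Process(target=computeDifferentialTADs,
                     args=('target.cool', 'control.cool', domains,
                           True, 0.05, None, queue))
    worker.start()
    p_values, left, right, intra, rows = queue.get()
    worker.join()
    return p_values, left, right, intra, rows
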
def open_and_store_matrix(pMatrixName, pMatricesList, pIndex, pXDimension,
                          pChromosomes, pNorm, pExtraTrack, pHistonMarkType,
                          pBinarization, pQueue):
    compartments_matrix = None

    for i, matrix in enumerate(pMatricesList):

        ma = hm.hiCMatrix(pMatrixName + '::' + matrix)

        # WARNING
        # DO NOT APPLY BIN MASKING, WILL LEAD TO DIFFERENT SIZES OF THE CHROMOSOMES
        # THIS IS CAUSING A FAIL OF THE COMPUTATION
        # ma.maskBins(ma.nan_bins)
        k = 1
        if pChromosomes:
            ma.keepOnlyTheseChr(pChromosomes)

        vecs_list = []
        chrom_list = []
        start_list = []
        end_list = []
        # PCA is computed per chromosome
        length_chromosome = 0
        chromosome_count = len(ma.getChrNames())

        for chrname in ma.getChrNames():
            chr_range = ma.getChrBinRange(chrname)
            length_chromosome += chr_range[1] - chr_range[0]

        if pExtraTrack and (pExtraTrack.endswith('.bw')
                            or pExtraTrack.endswith('.bigwig')):
            bwTrack = pyBigWig.open(pExtraTrack, 'r')

        for chrname in ma.getChrNames():
            chr_range = ma.getChrBinRange(chrname)
            submatrix = ma.matrix[chr_range[0]:chr_range[1],
                                  chr_range[0]:chr_range[1]]
            if pNorm:
                obs_exp_matrix_ = obs_exp_matrix_norm(submatrix)

            else:
                obs_exp_matrix_ = obs_exp_matrix_lieberman(
                    submatrix, length_chromosome, chromosome_count)
            obs_exp_matrix_ = convertNansToZeros(
                csr_matrix(obs_exp_matrix_)).todense()
            obs_exp_matrix_ = convertInfsToZeros(
                csr_matrix(obs_exp_matrix_)).todense()

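            # Pearson correlation of the observed/expected matrix; the first k
            # eigenvectors of its covariance are kept as the compartment signal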
            pearson_correlation_matrix = np.corrcoef(obs_exp_matrix_)
            pearson_correlation_matrix = convertNansToZeros(
                csr_matrix(pearson_correlation_matrix)).todense()
            pearson_correlation_matrix = convertInfsToZeros(
                csr_matrix(pearson_correlation_matrix)).todense()

            corrmatrix = np.cov(pearson_correlation_matrix)
            corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
            corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
            evals, eigs = linalg.eig(corrmatrix)

            chrom, start, end, _ = zip(
                *ma.cut_intervals[chr_range[0]:chr_range[1]])

            chrom_list += chrom
            start_list += start
            end_list += end
            if pExtraTrack and (pExtraTrack.endswith('.bw')
                                or pExtraTrack.endswith('.bigwig')):
                assert (len(end) == len(start))
                correlateEigenvectorWithHistonMarkTrack(
                    eigs[:, :k].transpose(), bwTrack, chrname, start, end,
                    pExtraTrack, pHistonMarkType)

            vecs_list += eigs[:, :k].tolist()
        if compartments_matrix is None:
            compartments_matrix = np.zeros(
                [pXDimension, len(np.array(vecs_list).flatten())],
                dtype=np.float64)

        eigenvector = np.real(np.array(vecs_list).flatten())
        eigenvector[np.isnan(eigenvector)] = 0
        eigenvector[np.isinf(eigenvector)] = 0

        if pBinarization:
            mask = eigenvector <= 0
            eigenvector[mask] = -1
            mask = eigenvector > 0
            eigenvector[mask] = 1

        compartments_matrix[pIndex + i, :] = eigenvector

    pQueue.put(compartments_matrix)

    return
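
# A small, self-contained illustration of the pBinarization step above: the
# eigenvector is reduced to its sign so the two compartment states can be told
# apart. The toy vector is made up; this mirrors, but is not, the code above.
import numpy as np


def binarize_eigenvector(eigenvector):
    ev = np.real(np.asarray(eigenvector, dtype=float)).copy()
    ev[np.isnan(ev)] = 0
    ev[np.isinf(ev)] = 0
    ev[ev <= 0] = -1
    ev[ev > 0] = 1
    return ev

# binarize_eigenvector([0.3, -0.1, float('nan'), 0.0]) -> array([ 1., -1., -1., -1.])
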
def main(args=None):

    args = parse_arguments().parse_args(args)

    hic_ma = hm.hiCMatrix(pMatrixFile=args.matrix)
    indices_values = []
    with open(args.regions, 'r') as file:
        for line in file.readlines():
            if len(line.strip()) == 0:
                continue
            _line = line.strip().split('\t')
            if len(_line) == 2:
                chrom, start = _line[0], _line[1]

                viewpoint = (chrom, start, start)
            elif len(_line) >= 3:
                chrom, start, end = _line[0], _line[1], _line[2]
                viewpoint = (chrom, start, end)
            if args.range:
                start_range_genomic, end_range_genomic, _ = calculateViewpointRange(
                    hic_ma, viewpoint, args.range)
                # min_length, max_length = hic_ma.getBinPos(hic_ma.getChrBinRange(pViewpoint[0])[1] - 1)[1:]
                # if start_range_genomic < min_length:
                #     log.warning('Ignoring {} {} {} because the reference point minus the range {} is smaller than the chromosome border.'.format(viewpoint[0], viewpoint[1], viewpoint[2], args.range))
                #     continue
                # if end_bin > :
                #     log.warning('Ignoring {} {} {} because the reference point plus the range {} is greater than the chromosome border.'.format(viewpoint[0], viewpoint[1], viewpoint[2], args.range))
                #     continue
                start_bin, end_bin = getBinIndices(
                    hic_ma, (chrom, start_range_genomic, end_range_genomic))
            else:
                start_bin, end_bin = calculateViewpointRangeBins(
                    hic_ma, viewpoint, args.rangeInBins)
            # if start_bin < 0:
            #     log.warning('Ignoring {} {} {} because the reference point minus the range {} is smaller than the chromosome border.'.format(viewpoint[0], viewpoint[1], viewpoint[2], args.range))
            #     continue
            # if end_bin > :
            #     log.warning('Ignoring {} {} {} because the reference point plus the range {} is greater than the chromosome border.'.format(viewpoint[0], viewpoint[1], viewpoint[2], args.range))
            #     continue
            indices_values.append([start_bin, end_bin])

    if args.range:
        dimensions_new_matrix = (args.range[0] // hic_ma.getBinSize()) + (
            args.range[1] // hic_ma.getBinSize())
    elif args.rangeInBins:
        dimensions_new_matrix = args.rangeInBins[0] + args.rangeInBins[1]
    # summed_matrix = csr_matrix((dimensions_new_matrix, dimensions_new_matrix), dtype=np.float32)
    summed_matrix = lil_matrix((dimensions_new_matrix, dimensions_new_matrix),
                               dtype=np.float32)

    max_length = hic_ma.matrix.shape[1]
    for start, end in indices_values:
        _start = 0
        _end = summed_matrix.shape[1]
        if start < 0:
            _start = np.absolute(start)
            start = 0
        if end >= max_length:
            _end = end
            end = max_length

        summed_matrix[_start:_end, _start:_end] += hic_ma.matrix[start:end,
                                                                 start:end]

    summed_matrix /= len(indices_values)

    summed_matrix = summed_matrix.tocsr()
    save_npz(args.outFileName, summed_matrix)
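
# A toy sketch of the accumulation loop above: equally sized windows of a small,
# made-up contact matrix are summed into one sub-matrix and averaged. The real
# code additionally clips windows that reach beyond the matrix borders.
import numpy as np
from scipy.sparse import csr_matrix

full = csr_matrix(np.arange(36, dtype=np.float32).reshape(6, 6))
windows = [(0, 3), (2, 5)]  # bin ranges around two hypothetical viewpoints
accumulated = np.zeros((3, 3), dtype=np.float32)
for window_start, window_end in windows:
    accumulated += full[window_start:window_end, window_start:window_end].toarray()
averaged = csr_matrix(accumulated / len(windows))
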
def main(args=None):
    args = parse_arguments().parse_args(args)
    if args.verbose:
        log.setLevel(logging.INFO)

    if check_cooler(args.matrix) and args.chromosomes is not None and len(
            args.chromosomes) == 1:
        ma = hm.hiCMatrix(args.matrix, pChrnameList=toString(args.chromosomes))
    else:
        ma = hm.hiCMatrix(args.matrix)

        if args.chromosomes:
            ma.reorderChromosomes(toString(args.chromosomes))

    # mask all zero value bins
    if 'correctionMethod' in args:
        if args.correctionMethod == 'ICE':
            row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
            log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
            ma.maskBins(np.flatnonzero(row_sum == 0))
            matrix_shape = ma.matrix.shape
    if 'plotName' in args:
        row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
        log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
        ma.maskBins(np.flatnonzero(row_sum == 0))
        matrix_shape = ma.matrix.shape

    ma.matrix = convertNansToZeros(ma.matrix)
    ma.matrix = convertInfsToZeros(ma.matrix)
    ma.matrix = ma.matrix.astype(np.float64, copy=True)

    log.debug('ma.matrix.indices {}'.format(ma.matrix.indices.dtype))
    log.debug('ma.matrix.data {}'.format(ma.matrix.data.dtype))
    log.debug('ma.matrix.indptr {}'.format(ma.matrix.indptr.dtype))

    # log.debug('ma.matrix.indices {}'.format(np.max(ma.matrix.indices)))
    # log.debug('ma.matrix.data {}'.format(np.max(ma.matrix.data)))
    # log.debug('ma.matrix.indptr {}'.format(np.max(ma.matrix.indptr)))

    # ma.matrix.indptr = ma.matrix.indptr.astype(np.int32, copy=False)
    # ma.matrix.indices = ma.matrix.indices.astype(np.int32, copy=False)

    if 'plotName' in args:
        plot_total_contact_dist(ma, args)
        log.info("Saving diagnostic plot {}\n".format(args.plotName))
        return

    log.info("matrix contains {} data points. Sparsity {:.3f}.".format(
        len(ma.matrix.data),
        float(len(ma.matrix.data)) / (ma.matrix.shape[0]**2)))

    if args.skipDiagonal:
        ma.diagflat(value=0)

    total_filtered_out = set()
    if args.correctionMethod == 'ICE':
        if not args.filterThreshold:
            log.error('min and max filtering thresholds should be set')
            sys.exit(1)
        outlier_regions = filter_by_zscore(ma,
                                           args.filterThreshold[0],
                                           args.filterThreshold[1],
                                           perchr=args.perchr)
        # compute and print some statistics
        pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
        ma.printchrtoremove(outlier_regions,
                            label="Bins that are MAD outliers ({:.2f}%) "
                            "out of {}".format(pct_outlier,
                                               ma.matrix.shape[0]),
                            restore_masked_bins=False)

        assert matrix_shape == ma.matrix.shape
        # mask filtered regions
        ma.maskBins(outlier_regions)
        total_filtered_out = set(outlier_regions)

        if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1:
            chrom, _, _, coverage = zip(*ma.cut_intervals)

            assert type(coverage[0]) == np.float64

            failed_bins = np.flatnonzero(
                np.array(coverage) < args.sequencedCountCutoff)

            ma.printchrtoremove(failed_bins,
                                label="Bins with low coverage",
                                restore_masked_bins=False)
            ma.maskBins(failed_bins)
            total_filtered_out = set(failed_bins)
            """
            ma.matrix, to_remove = fill_gaps(ma, failed_bins)
            log.warning("From {} failed bins, {} could "
                         "not be filled\n".format(len(failed_bins),
                                                  len(to_remove)))
            ma.maskBins(to_remove)
            """

        if args.transCutoff and 0 < args.transCutoff < 100:
            cutoff = float(args.transCutoff) / 100
            # a usual cutoff is 0.05
            ma.truncTrans(high=cutoff)
            pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()

    correction_factors = []
    corrected_matrix = lil_matrix(ma.matrix.shape)
    if args.perchr:
        # normalize each chromosome independently
        for chrname in list(ma.interval_trees):
            chr_range = ma.getChrBinRange(chrname)
            chr_submatrix = ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]
            if args.correctionMethod == 'ICE':
                _matrix, _corr_factors = iterative_correction(
                    chr_submatrix, args)
                corrected_matrix[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = _matrix
                correction_factors.append(_corr_factors)
            else:
                # Set the kr matrix along with its correction factors vector
                assert (args.correctionMethod == 'KR')
                log.debug("Loading a float sparse matrix for KR balancing")
                kr = kr_balancing(
                    chr_submatrix.shape[0], chr_submatrix.shape[1],
                    chr_submatrix.count_nonzero(),
                    chr_submatrix.indptr.astype(np.int64, copy=False),
                    chr_submatrix.indices.astype(np.int64, copy=False),
                    chr_submatrix.data.astype(np.float64, copy=False))
                kr.computeKR()
                if args.outFileName.endswith('.h5'):
                    corrected_matrix[
                        chr_range[0]:chr_range[1],
                        chr_range[0]:chr_range[1]] = kr.get_normalised_matrix(
                            True)
                # correction_factors.append(np.true_divide(1,
                #                                          kr.get_normalisation_vector(False).todense()))
                correction_factors.append(
                    kr.get_normalisation_vector(False).todense())

        correction_factors = np.concatenate(correction_factors)

    else:
        if args.correctionMethod == 'ICE':
            corrected_matrix, correction_factors = iterative_correction(
                ma.matrix, args)
            ma.setMatrixValues(corrected_matrix)
        else:
            assert (args.correctionMethod == 'KR')
            log.debug("Loading a float sparse matrix for KR balancing")
            kr = kr_balancing(ma.matrix.shape[0], ma.matrix.shape[1],
                              ma.matrix.count_nonzero(),
                              ma.matrix.indptr.astype(np.int64, copy=False),
                              ma.matrix.indices.astype(np.int64, copy=False),
                              ma.matrix.data.astype(np.float64, copy=False))
            log.debug('passed pointers')
            kr.computeKR()
            log.debug('computation done')

            # set it to False since the vector is already normalised
            # with the previous True
            # correction_factors = np.true_divide(1, kr.get_normalisation_vector(False).todense())
            correction_factors = kr.get_normalisation_vector(False).todense()

            if args.outFileName.endswith('.h5'):
                corrected_matrix = kr.get_normalised_matrix(True)

    if args.outFileName.endswith('.h5'):
        ma.setMatrixValues(corrected_matrix)
    ma.setCorrectionFactors(correction_factors)

    log.debug("Correction factors {}".format(correction_factors[:10]))
    if args.inflationCutoff and args.inflationCutoff > 0 and args.correctionMethod == 'ICE':

        after_row_sum = np.asarray(corrected_matrix.sum(axis=1)).flatten()
        # identify rows that were expanded more than args.inflationCutoff times
        to_remove = np.flatnonzero(
            after_row_sum / pre_row_sum >= args.inflationCutoff)
        ma.printchrtoremove(to_remove,
                            label="inflated >={} "
                            "regions".format(args.inflationCutoff),
                            restore_masked_bins=False)
        total_filtered_out = total_filtered_out.union(to_remove)
        ma.maskBins(to_remove)
    ma.printchrtoremove(sorted(list(total_filtered_out)),
                        label="Total regions to be removed",
                        restore_masked_bins=False)

    ma.save(args.outFileName, pApplyCorrection=False)
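
# A compact stand-in for the balancing idea used above. It is NOT the
# iterative_correction()/kr_balancing implementation called in main(); it only
# sketches ICE-style balancing: rows/columns are repeatedly rescaled towards
# equal coverage, and the accumulated per-bin scaling gives the correction factors.
import numpy as np


def toy_iterative_correction(matrix, iterations=50):
    m = np.asarray(matrix, dtype=float).copy()
    bias = np.ones(m.shape[0])
    for _ in range(iterations):
        coverage = m.sum(axis=1)
        nonzero = coverage > 0
        coverage[nonzero] /= coverage[nonzero].mean()
        coverage[~nonzero] = 1.0
        bias *= coverage
        m /= np.outer(coverage, coverage)
    return m, bias
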
Example #5
def test_restoreMaskedBins():
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    matrix = np.array([[1, 8, 5, 3, 0], [0, 4, 15, 5, 1], [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1], [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)

    nt.assert_equal(hic.getMatrix(), matrix)
    nt.assert_equal(hic.orig_bin_ids, [])

    # function should directly return if there are no masked_bins
    hic.restoreMaskedBins()

    nt.assert_equal(hic.getMatrix(), matrix)
    nt.assert_equal(hic.orig_bin_ids, [])

    # test general use
    # first get some masked bins
    masking_ids = [0, 1]
    hic.maskBins(masking_ids)

    new_matrix = np.array([[0, 0, 2], [0, 0, 1], [0, 0, 0]])

    nt.assert_equal(hic.getMatrix(), new_matrix)
    nt.assert_equal(sorted(hic.orig_bin_ids), sorted([0, 1, 2, 3, 4]))

    # and now restore masked bins
    hic.restoreMaskedBins()

    result_matrix = np.array([[np.nan, np.nan, np.nan, np.nan, np.nan],
                              [np.nan, np.nan, np.nan, np.nan, np.nan],
                              [np.nan, np.nan, 0, 0, 2],
                              [np.nan, np.nan, 0, 0, 1],
                              [np.nan, np.nan, 0, 0, 0]])

    nt.assert_equal(hic.getMatrix(), result_matrix)
    nt.assert_equal(hic.orig_bin_ids, [])

    row, col = np.triu_indices(5)
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('a', 30, 40, 1), ('b', 40, 50, 1)]
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    matrix = np.array([[0, 10, 5, 3, 0], [0, 0, 15, 5, 1], [0, 0, 0, 7, 3],
                       [0, 0, 0, 0, 1], [0, 0, 0, 0, 0]],
                      dtype=np.int32)

    # make the matrix symmetric:
    hic.matrix = csr_matrix(matrix + matrix.T)
    hic.setMatrix(csr_matrix(matrix + matrix.T), cut_intervals)

    # add some masked bins
    hic.maskBins([3])

    matrix = hic.matrix.todense()
    test_matrix = np.array(
        [[0, 10, 5, 0], [10, 0, 15, 1], [5, 15, 0, 3], [0, 1, 3, 0]],
        dtype=np.int32)

    nt.assert_equal(matrix, test_matrix)

    cut_int = hic.cut_intervals
    test_cut_int = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                    ('b', 40, 50, 1)]

    nt.assert_equal(cut_int, test_cut_int)

    hic.restoreMaskedBins()

    dense = hic.matrix.todense()
    test_dense = np.array([[0., 10., 5., 0., 0.], [10., 0., 15., 0., 1.],
                           [5., 15., 0., 0., 3.], [0., 0., 0., 0., 0.],
                           [0., 1., 3., 0., 0.]])

    nt.assert_equal(dense, test_dense)

    cut_int = hic.cut_intervals
    test_cut_int = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                    ('a', 30, 40, 1), ('b', 40, 50, 1)]

    nt.assert_equal(cut_int, test_cut_int)
Example #6
    def set_properties_defaults(self):
        super(HiCMatrixTrack, self).set_properties_defaults()
        region = None
        if self.properties['region'] is not None:
            if self.properties['region'][2] == 1e15:
                region = [str(self.properties['region'][0])]
            elif len(self.properties['region']) == 3:
                start = int(
                    self.properties['region'][1]) - self.properties['depth']
                if start < 0:
                    start = 0
                end = int(
                    self.properties['region'][2]) + self.properties['depth']

                region = [
                    str(self.properties['region'][0]) + ':' + str(start) +
                    '-' + str(end)
                ]
        # try to open the region extended by 'depth' to avoid a triangle effect in the plot;
        # if that fails, open it with the given end position.
        try:
            self.hic_ma = HiCMatrix.hiCMatrix(self.properties['file'],
                                              pChrnameList=region)
        except Exception:
            region = [
                str(self.properties['region'][0]) + ':' + str(start) + '-' +
                str(self.properties['region'][2])
            ]
            self.hic_ma = HiCMatrix.hiCMatrix(self.properties['file'],
                                              pChrnameList=region)

        if len(self.hic_ma.matrix.data) == 0:
            raise Exception("Matrix {} is empty".format(
                self.properties['file']))
        if not self.properties['show_masked_bins']:
            self.hic_ma.maskBins(self.hic_ma.nan_bins)

        # check that the matrix can be log transformed
        if self.properties['transform'] != 'no':
            if self.properties['transform'] == 'log1p':
                if self.hic_ma.matrix.data.min() + 1 <= 0:
                    raise Exception(
                        "\n*ERROR*\nMatrix contains values below -1.\n"
                        "log1p transformation cannot be applied to\n"
                        "values in matrix: {}".format(self.properties['file']))

            elif self.properties['transform'] in ['-log', 'log']:
                if self.hic_ma.matrix.data.min() < 0:
                    # Values that are not filled or equal to zero will be masked
                    # and replaced by the minimum value greater than 0.
                    raise Exception(
                        "\n*ERROR*\nMatrix contains negative values.\n"
                        "log transformation cannot be applied to\n"
                        "values in matrix: {}".format(self.properties['file']))

        new_intervals = hicmatrix.utilities.enlarge_bins(
            self.hic_ma.cut_intervals)
        self.hic_ma.interval_trees, self.hic_ma.chrBinBoundaries = \
            self.hic_ma.intervalListToIntervalTree(new_intervals)

        self.hic_ma.cut_intervals = new_intervals
        binsize = self.hic_ma.getBinSize()
        max_depth_in_bins = int(self.properties['depth'] / binsize)

        # work only with the upper triangle of the matrix
        # and remove all pixels that are beyond
        # 2 * max_depth_in_bins, which are not required
        # (this is done by subtracting a second sparse matrix
        # that contains only the part of the upper triangle to be removed).
        limit = 2 * max_depth_in_bins
        self.hic_ma.matrix = scipy.sparse.triu(self.hic_ma.matrix, k=0, format='csr') - \
            scipy.sparse.triu(self.hic_ma.matrix, k=limit, format='csr')
        self.hic_ma.matrix.eliminate_zeros()

        # fill the main diagonal, otherwise the plot does not look good.
        # The main diagonal is filled with an array containing the
        # max value found in the matrix
        if sum(self.hic_ma.matrix.diagonal()) == 0:
            self.log.info(
                "Filling main diagonal with max value because it is empty and looks bad...\n"
            )
            max_value = self.hic_ma.matrix.data.max()
            main_diagonal = scipy.sparse.dia_matrix(
                ([max_value] * self.hic_ma.matrix.shape[0], [0]),
                shape=self.hic_ma.matrix.shape)
            self.hic_ma.matrix = self.hic_ma.matrix + main_diagonal

        self.norm = None

        self.process_color('colormap',
                           colormap_possible=True,
                           colormap_only=True,
                           default_value_is_colormap=True)

        self.cmap = cm.get_cmap(self.properties['colormap'])
        self.cmap.set_bad('black')
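
# A minimal illustration of the band extraction above: keep the upper triangle
# only up to 'limit' off-diagonals by subtracting a second triangular matrix.
# Uses a small made-up matrix instead of a Hi-C file.
import numpy as np
import scipy.sparse

dense = np.arange(1, 26, dtype=float).reshape(5, 5)
sparse_matrix = scipy.sparse.csr_matrix(dense)
limit = 2  # stands in for 2 * max_depth_in_bins
band = scipy.sparse.triu(sparse_matrix, k=0, format='csr') - \
    scipy.sparse.triu(sparse_matrix, k=limit, format='csr')
band.eliminate_zeros()
# band now holds only the main diagonal and the first off-diagonal
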
Example #7
def main(args=None):
    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match the number of output matrices!'
            )
            exit(1)
    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:

                for resolution in args.resolutions:
                    out_name = args.outFileName[i].split('.')
                    out_name[-2] = out_name[-2] + '_' + str(resolution)
                    out_name = '.'.join(out_name)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']:
        format_was_h5 = False
        if args.inputFormat == 'h5':
            format_was_h5 = True
        applyCorrection = True
        if args.store_applied_correction:
            applyCorrection = False
        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
            else:
                correction_operator = None

                if args.correction_division:
                    correction_operator = '/'

                chromosomes_to_load = None
                if args.chromosome:
                    chromosomes_to_load = [args.chromosome]

                applyCorrectionCoolerLoad = True
                if args.load_raw_values:
                    applyCorrectionCoolerLoad = False
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)

            _matrix, cut_intervals, nan_bins, \
                distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                if args.outputFormat in ['homer', 'ginteractions']:
                    # make it an upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5)

                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)
            elif args.outputFormat in ['mcool']:

                log.debug('outformat is mcool')
                if args.resolutions and len(args.matrices) > 1:
                    log.error(
                        'Please provide either a single matrix with the resolutions to create, or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed; they are not valid for any newly created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)

                    bin_size = hic_matrix.getBinSize()

                    for j, resolution in enumerate(args.resolutions):
                        hic_matrix_res = deepcopy(hic_matrix)

                        _mergeFactor = int(resolution) // bin_size

                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            merged_matrix = hic_matrix_res
                        append = False
                        if j > 0:
                            append = True
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=append,
                            pFileWasH5=format_was_h5)

                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' +
                            str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)

                else:
                    append = False
                    if i > 0:
                        append = True
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        pAppend=append,
                        pFileWasH5=format_was_h5)

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
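
# A short sketch of the symmetrization applied above for 'homer'/'ginteractions'
# output: keep the upper triangle and mirror it so an asymmetric input does not
# end up half empty. The 3 x 3 matrix is made up.
import numpy as np
from scipy.sparse import csr_matrix, triu

asymmetric = csr_matrix(np.array([[1., 2., 0.],
                                  [9., 3., 4.],
                                  [0., 0., 5.]]))
upper = triu(asymmetric, format='csr')
symmetric = upper.maximum(upper.T)
# symmetric.toarray() ->
# [[1., 2., 0.],
#  [2., 3., 4.],
#  [0., 4., 5.]]
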
Example #8
def computeInterIntraTADs(pMatrix, pDomainList, pCoolOrH5, pThreadId, pQueue):
    try:

        inter_left_sum_list = []
        inter_right_sum_list = []
        inter_left_densit_list = []
        inter_right_density_list = []
        inter_left_number_of_contacts_list = []
        inter_right_number_of_contacts_list = []
        inter_left_number_of_contacts_nnz_list = []
        inter_right_number_of_contacts_nzz_list = []

        intra_sum_list = []
        intra_number_of_contacts_list = []
        intra_number_of_contacts_nnz_list = []
        intra_density_list = []
        inter_left_intra_ratio_list = []
        inter_right_intra_ratio_list = []
        inter_left_inter_right_intra_ratio_list = []

        rows = []

        chromosome_list = pDomainList
        for i, row in enumerate(chromosome_list):

            if pThreadId is None:
                log.debug('first thread')
                if i == len(chromosome_list) - 1:
                    continue
            elif pThreadId is True:
                log.debug('middle thread')

                if i == 0 or i == len(chromosome_list) - 1:
                    log.debug('i: {}'.format(i))
                    log.debug('len(chromosome_list): {}'.format(
                        len(chromosome_list)))

                    continue
            elif pThreadId is False:
                log.debug('last thread')

                if i == 0:
                    continue

            if i - 1 >= 0:
                chromosom = chromosome_list[i - 1][0]
                start = chromosome_list[i - 1][1]
            else:
                chromosom = chromosome_list[i][0]
                start = chromosome_list[i][1]
            if i + 1 < len(chromosome_list):
                end = chromosome_list[i + 1][2]
            else:
                end = chromosome_list[i][2]
            # midpos = row[1] + ((row[2] - row[1]) / 2)

            if pCoolOrH5:

                # get intra-TAD data
                hic_matrix = hm.hiCMatrix(pMatrixFile=pMatrix,
                                          pChrnameList=[
                                              str(row[0]) + ':' + str(row[1]) +
                                              '-' + str(row[2])
                                          ])
                matrix = hic_matrix.matrix

                hic_matrix_inter_tad = hm.hiCMatrix(
                    pMatrixFile=pMatrix,
                    pChrnameList=[
                        str(chromosom) + ':' + str(start) + '-' + str(end)
                    ])

                matrix_inter_tad = hic_matrix_inter_tad.matrix

            else:
                # in the h5 case, pMatrix is already a HiCMatrix object
                hic_matrix = pMatrix
                hic_matrix_inter_tad = pMatrix
                indices = hic_matrix.getRegionBinRange(str(row[0]), row[1],
                                                       row[2])

                matrix = hic_matrix.matrix[indices[0]:indices[1],
                                           indices[0]:indices[1]]
                matrix_inter_tad = pMatrix.matrix

            # matrix = matrix.flatten()

            # index of the boundary between the left neighboring TAD and the current TAD
            left_boundary_index = hic_matrix_inter_tad.getRegionBinRange(
                str(chromosom), row[1], row[1])[0]
            if pCoolOrH5:
                outer_left_boundary_index = 0

                outer_right_boundary_index = -1

            else:
                outer_left_boundary_index = hic_matrix_inter_tad.getRegionBinRange(
                    str(chromosom), start, end)[0]

                outer_right_boundary_index = hic_matrix_inter_tad.getRegionBinRange(
                    str(chromosom), start, end)[1]

            if i + 1 < len(chromosome_list) and not pCoolOrH5:
                # index of the boundary between the current TAD and its right neighbor
                right_boundary_index = hic_matrix_inter_tad.getRegionBinRange(
                    str(chromosom), row[2], row[2])[0]
            elif i + 1 < len(chromosome_list):
                right_boundary_index = hic_matrix_inter_tad.getRegionBinRange(
                    str(chromosom), row[2], row[2])[0]

            if i - 1 >= 0 and i + 1 < len(chromosome_list):
                intertad_left = matrix_inter_tad[
                    outer_left_boundary_index:left_boundary_index,
                    left_boundary_index:right_boundary_index]
                intertad_right = matrix_inter_tad[
                    left_boundary_index:right_boundary_index,
                    right_boundary_index:outer_right_boundary_index]

            elif i - 1 < 0 and i + 1 < len(chromosome_list):
                intertad_right = matrix_inter_tad[
                    left_boundary_index:right_boundary_index,
                    right_boundary_index:outer_right_boundary_index]

            elif i - 1 > 0 and i + 1 >= len(chromosome_list):
                intertad_left = matrix_inter_tad[
                    outer_left_boundary_index:left_boundary_index,
                    left_boundary_index:right_boundary_index]

            inter_left_sum = 0
            inter_right_sum = 0
            inter_left_density = 0
            inter_right_density = 0
            inter_left_number_of_contacts = 0
            inter_right_number_of_contacts = 0
            inter_left_number_of_contacts_nnz = 0
            inter_right_number_of_contacts_nzz = 0

            intra_sum = matrix.sum()
            intra_number_of_contacts = matrix.shape[0] * matrix.shape[1]
            intra_number_of_contacts_nnz = matrix.nnz
            intra_density = intra_number_of_contacts_nnz / intra_number_of_contacts
            # both inter regions, left and right, are available
            if i - 1 >= 0 and i + 1 < len(chromosome_list):
                # intertad_left = intertad_left.flatten()
                # intertad_right = intertad_right.flatten()
                inter_left_sum = intertad_left.sum()
                inter_right_sum = intertad_right.sum()

                inter_left_number_of_contacts = intertad_left.shape[
                    0] * intertad_left.shape[1]
                inter_right_number_of_contacts = intertad_right.shape[
                    0] * intertad_right.shape[1]
                inter_left_number_of_contacts_nnz = intertad_left.nnz
                inter_right_number_of_contacts_nzz = intertad_right.nnz

                inter_left_density = inter_left_number_of_contacts_nnz / inter_left_number_of_contacts
                inter_right_density = inter_right_number_of_contacts_nzz / inter_right_number_of_contacts
                # statistic_left, significance_level_left = ranksums(intertad_left, intertad_left_control)
                # statistic_right, significance_level_right = ranksums(intertad_right, intertad_right_control)
            elif i - 1 < 0 and i + 1 < len(chromosome_list):
                # inter right is available
                # intertad_right = intertad_right.flatten()
                inter_right_sum = intertad_right.sum()
                inter_right_number_of_contacts = intertad_right.shape[
                    0] * intertad_right.shape[1]
                inter_right_number_of_contacts_nzz = intertad_right.nnz
                inter_right_density = inter_right_number_of_contacts_nzz / inter_right_number_of_contacts

                # statistic_right, significance_level_right = ranksums(intertad_right, intertad_right_control)
            elif i - 1 > 0 and i + 1 >= len(chromosome_list):
                # inter left is available

                # intertad_left = intertad_left.flatten()
                inter_left_sum = intertad_left.sum()
                inter_left_number_of_contacts = intertad_left.shape[
                    0] * intertad_left.shape[1]
                inter_left_number_of_contacts_nnz = intertad_left.nnz
                inter_left_density = inter_left_number_of_contacts_nnz / inter_left_number_of_contacts

                # statistic_left, significance_level_left = ranksums(intertad_left, intertad_left_control)

            inter_left_intra_ratio = inter_left_sum / intra_sum
            inter_right_intra_ratio = inter_right_sum / intra_sum
            inter_left_inter_right_intra_ratio = (inter_left_sum +
                                                  inter_right_sum) / intra_sum

            inter_left_sum_list.append(inter_left_sum)
            inter_right_sum_list.append(inter_right_sum)
            inter_left_densit_list.append(inter_left_density)
            inter_right_density_list.append(inter_right_density)
            inter_left_number_of_contacts_list.append(
                inter_left_number_of_contacts)
            inter_right_number_of_contacts_list.append(
                inter_right_number_of_contacts)
            inter_left_number_of_contacts_nnz_list.append(
                inter_left_number_of_contacts_nnz)
            inter_right_number_of_contacts_nzz_list.append(
                inter_right_number_of_contacts_nzz)

            intra_sum_list.append(intra_sum)
            intra_number_of_contacts_list.append(intra_number_of_contacts)
            intra_number_of_contacts_nnz_list.append(
                intra_number_of_contacts_nnz)
            intra_density_list.append(intra_density)
            inter_left_intra_ratio_list.append(inter_left_intra_ratio)
            inter_right_intra_ratio_list.append(inter_right_intra_ratio)
            inter_left_inter_right_intra_ratio_list.append(
                inter_left_inter_right_intra_ratio)

            rows.append(row)
    except Exception as exp:
        pQueue.put('Fail: ' + str(exp) + traceback.format_exc())
        return
    pQueue.put([
        inter_left_sum_list, inter_right_sum_list, inter_left_densit_list,
        inter_right_density_list, inter_left_number_of_contacts_list,
        inter_right_number_of_contacts_list,
        inter_left_number_of_contacts_nnz_list,
        inter_right_number_of_contacts_nzz_list, intra_sum_list,
        intra_number_of_contacts_list, intra_number_of_contacts_nnz_list,
        intra_density_list, inter_left_intra_ratio_list,
        inter_right_intra_ratio_list, inter_left_inter_right_intra_ratio_list,
        rows
    ])
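
# A toy illustration of the density and inter/intra ratio bookkeeping above,
# using made-up sparse sub-matrices instead of real TAD regions.
import numpy as np
from scipy.sparse import csr_matrix

intra = csr_matrix(np.array([[0., 5., 0.],
                             [5., 0., 2.],
                             [0., 2., 0.]]))
inter_left = csr_matrix(np.array([[1., 0., 0.],
                                  [0., 0., 3.]]))

intra_density = intra.nnz / (intra.shape[0] * intra.shape[1])  # 4 / 9
inter_left_intra_ratio = inter_left.sum() / intra.sum()        # 4.0 / 14.0
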
def test_build_matrix_cooler_multiple():
    outfile = NamedTemporaryFile(suffix='.cool', delete=False)
    outfile.close()
    qc_folder = mkdtemp(prefix="testQC_")
    args = "-s {} {} --outFileName {} -bs 5000 10000 20000 -b /tmp/test.bam --QCfolder {} --threads 4".format(
        sam_R1, sam_R2, outfile.name, qc_folder).split()
    hicBuildMatrix.main(args)

    test_5000 = hm.hiCMatrix(
        ROOT +
        "hicBuildMatrix/multi_small_test_matrix.cool::/resolutions/5000")
    test_10000 = hm.hiCMatrix(
        ROOT +
        "hicBuildMatrix/multi_small_test_matrix.cool::/resolutions/10000")
    test_20000 = hm.hiCMatrix(
        ROOT +
        "hicBuildMatrix/multi_small_test_matrix.cool::/resolutions/20000")

    new_5000 = hm.hiCMatrix(outfile.name + '::/resolutions/5000')
    new_10000 = hm.hiCMatrix(outfile.name + '::/resolutions/10000')
    new_20000 = hm.hiCMatrix(outfile.name + '::/resolutions/20000')

    nt.assert_equal(test_5000.matrix.data, new_5000.matrix.data)
    nt.assert_equal(test_10000.matrix.data, new_10000.matrix.data)
    nt.assert_equal(test_20000.matrix.data, new_20000.matrix.data)

    # nt.assert_equal(test.cut_intervals, new.cut_intervals)
    nt.assert_equal(len(new_5000.cut_intervals), len(test_5000.cut_intervals))
    nt.assert_equal(len(new_10000.cut_intervals),
                    len(test_10000.cut_intervals))
    nt.assert_equal(len(new_20000.cut_intervals),
                    len(test_20000.cut_intervals))

    cut_interval_new_ = []
    cut_interval_test_ = []
    for x in new_5000.cut_intervals:
        cut_interval_new_.append(x[:3])
    for x in test_5000.cut_intervals:
        cut_interval_test_.append(x[:3])

    nt.assert_equal(cut_interval_new_, cut_interval_test_)

    cut_interval_new_ = []
    cut_interval_test_ = []
    for x in new_10000.cut_intervals:
        cut_interval_new_.append(x[:3])
    for x in test_10000.cut_intervals:
        cut_interval_test_.append(x[:3])

    nt.assert_equal(cut_interval_new_, cut_interval_test_)

    cut_interval_new_ = []
    cut_interval_test_ = []
    for x in new_20000.cut_intervals:
        cut_interval_new_.append(x[:3])
    for x in test_20000.cut_intervals:
        cut_interval_test_.append(x[:3])

    nt.assert_equal(cut_interval_new_, cut_interval_test_)
    # print(set(os.listdir(ROOT + "QC/")))
    assert are_files_equal(ROOT + "QC/QC.log", qc_folder + "/QC.log")
    assert set(os.listdir(ROOT + "QC/")) == set(os.listdir(qc_folder))

    os.unlink(outfile.name)
    shutil.rmtree(qc_folder)
Example #10
def adjustMatrix(pArgs):
    if pArgs.chromosomes is not None and pArgs.regions is not None:
        log.error('Please specify either --chromosomes or --regions, not both.')
        exit(1)
    hic_matrix = None
    if pArgs.chromosomes:

        if check_cooler(pArgs.matrix) and len(pArgs.chromosomes) == 1 and pArgs.action == 'keep':
            chromosomes_list = cooler.Cooler(pArgs.matrix).chromnames
            if pArgs.chromosomes[0] in chromosomes_list:
                hic_matrix = hm.hiCMatrix(pArgs.matrix, pChrnameList=pArgs.chromosomes)
            else:
                log.error('Chromosome not available in matrix: {} {}'.format(pArgs.matrix, pArgs.chromosomes[0]))
                exit(1)
        else:
            hic_matrix = hm.hiCMatrix(pArgs.matrix)

        chromosomes_list = list(hic_matrix.chrBinBoundaries)
        chromosomes_list_to_operate_on = []
        for chromosome in pArgs.chromosomes:
            if chromosome in chromosomes_list:
                chromosomes_list_to_operate_on.append(chromosome)
            else:
                log.warning('Chromosome not available in matrix: {} {}'.format(pArgs.matrix, chromosome))
        if len(chromosomes_list_to_operate_on) == 0:
            log.error('No valid chromosome given: {}. Available: {}'.format(pArgs.chromosomes, chromosomes_list))
            exit(1)
        if pArgs.action == 'keep':
            hic_matrix.reorderChromosomes(chromosomes_list_to_operate_on)
        elif pArgs.action == 'remove':
            # chromosomes = list(hic_matrix.chrBinBoundaries)
            for chromosome in chromosomes_list:
                if chromosome in chromosomes_list_to_operate_on:
                    chromosomes_list.remove(chromosome)
            hic_matrix.reorderChromosomes(chromosomes_list)
        elif pArgs.action == 'mask':
            hic_matrix.maskChromosomes(chromosomes_list_to_operate_on)
    elif pArgs.regions:
        hic_matrix = hm.hiCMatrix(pArgs.matrix)
        chromosomes_list = list(hic_matrix.chrBinBoundaries)
        genomic_regions = []
        with open(pArgs.regions, 'r') as file:
            for line in file.readlines():
                _line = line.strip().split('\t')
                if len(_line) < 3:
                    log.warning("An entry with fewer than 3 columns has been found; skipping it.")
                    continue
                if len(_line) >= 3:
                    chrom, start, end = _line[0], int(_line[1]), int(_line[2])
                    if chrom in chromosomes_list:
                        genomic_regions.append((chrom, start, end))
                    else:
                        log.warning('Chromosome not available in matrix, ignoring regions: {} {}'.format(pArgs.matrix, chrom))
        if len(genomic_regions) == 0:
            log.error('No valid chromosome given. Available: {}'.format(chromosomes_list))
            exit(1)
        # log.debug('genomic_regions {}'.format(genomic_regions))
        matrix_indices_regions = []
        for region in genomic_regions:
            _regionBinRange = hic_matrix.getRegionBinRange(region[0], region[1], region[2])
            if _regionBinRange is not None:
                start, end = _regionBinRange
                matrix_indices_regions.extend(list(range(start, end)))

        # log.debug('matrix_indices_regions {}'.format(matrix_indices_regions))
        if pArgs.action == 'keep':
            values_submatrix = matrix_indices_regions
            instances, features = hic_matrix.matrix.nonzero()
            mask = np.isin(instances, values_submatrix)
            mask = np.logical_not(mask)
            hic_matrix.matrix.data[mask] = 0
            hic_matrix.matrix.eliminate_zeros()
        elif pArgs.action == 'mask':
            hic_matrix.maskBins(matrix_indices_regions)

        elif pArgs.action == 'remove':

            full_matrix_range = np.array(range(0, max(hic_matrix.matrix.shape[0], hic_matrix.matrix.shape[1])))
            matrix_indices_regions = np.array(matrix_indices_regions)
            full_matrix_range[matrix_indices_regions] = -1
            mask = full_matrix_range != -1
            full_matrix_range = full_matrix_range[mask]

            hic_matrix.reorderBins(full_matrix_range)
    elif pArgs.maskBadRegions:
        if check_cooler(pArgs.matrix) and pArgs.chromosomes is not None and len(pArgs.chromosomes) == 1 and pArgs.action == 'keep':
            hic_matrix = hm.hiCMatrix(pArgs.matrix, pChrnameList=pArgs.chromosomes)
        else:
            hic_matrix = hm.hiCMatrix(pArgs.matrix)

    else:
        log.info('No data to adjust given. Please specify either the --chromosomes or the --regions parameter.')

    return hic_matrix
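For --regions with action 'keep', adjustMatrix zeroes every stored entry whose row index is outside the selected bins: np.isin marks the rows to keep, the inverted mask zeroes the data array, and eliminate_zeros() drops the explicit zeros. A standalone sketch of that masking step on toy data (note that, like the code above, it masks by row index only):

import numpy as np
from scipy.sparse import csr_matrix

# toy 5x5 contact matrix
matrix = csr_matrix(np.arange(25, dtype=float).reshape(5, 5))
bins_to_keep = [1, 2]   # bin indices covered by the selected regions

rows, _cols = matrix.nonzero()
mask = np.isin(rows, bins_to_keep)       # True where the row index is kept
matrix.data[np.logical_not(mask)] = 0    # zero all other stored entries
matrix.eliminate_zeros()                 # drop the explicit zeros

print(matrix.toarray())                  # only rows 1 and 2 remain non-zero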
Example #11
def main(args=None):
    args = parse_arguments().parse_args(args)
    mpl.rcParams['pdf.fonttype'] = 42

    # read domains file
    domains_df = readDomainBoundaries(args.tadDomains)
    # log.debug('len(domains_df) {}'.format(len(domains_df)))
    domains = domains_df.values.tolist()
    old_chromosome = None

    tads_per_chromosome = []

    for j in range(len(domains)):
        if old_chromosome is None:
            old_chromosome = domains[j][0]
            per_chromosome = []
            per_chromosome.append(domains[j])

        elif old_chromosome == domains[j][0]:
            per_chromosome.append(domains[j])
            continue
        else:
            tads_per_chromosome.append(per_chromosome)
            per_chromosome = []
            per_chromosome.append(domains[j])
            old_chromosome = domains[j][0]
    tads_per_chromosome.append(per_chromosome)

    # read full h5 or only region if cooler
    is_cooler = check_cooler(args.matrix)

    if not is_cooler:
        hic_matrix = hm.hiCMatrix(args.matrix)
    else:
        hic_matrix = args.matrix

    inter_left_sum_list_chromosomes = []
    inter_right_sum_list_chromosomes = []
    inter_left_density_list_chromosomes = []
    inter_right_density_list_chromosomes = []
    inter_left_number_of_contacts_list_chromosomes = []
    inter_right_number_of_contacts_list_chromosomes = []
    inter_left_number_of_contacts_nnz_list_chromosomes = []
    inter_right_number_of_contacts_nzz_list_chromosomes = []

    intra_sum_list_chromosomes = []
    intra_number_of_contacts_list_chromosomes = []
    intra_number_of_contacts_nnz_list_chromosomes = []
    intra_density_list_chromosomes = []
    inter_left_intra_ratio_list_chromosomes = []
    inter_right_intra_ratio_list_chromosomes = []
    inter_left_inter_right_intra_ratio_list_chromosomes = []

    rows_chromosomes = []

    inter_left_sum_list_threads = [[]] * args.threads
    inter_right_sum_list_threads = [[]] * args.threads
    inter_left_density_list_threads = [[]] * args.threads
    inter_right_density_list_threads = [[]] * args.threads
    inter_left_number_of_contacts_list_threads = [[]] * args.threads
    inter_right_number_of_contacts_list_threads = [[]] * args.threads
    inter_left_number_of_contacts_nnz_list_threads = [[]] * args.threads
    inter_right_number_of_contacts_nzz_list_threads = [[]] * args.threads

    intra_sum_list_threads = [[]] * args.threads
    intra_number_of_contacts_list_threads = [[]] * args.threads
    intra_number_of_contacts_nnz_list_threads = [[]] * args.threads
    intra_density_list_threads = [[]] * args.threads
    inter_left_intra_ratio_list_threads = [[]] * args.threads
    inter_right_intra_ratio_list_threads = [[]] * args.threads
    inter_left_inter_right_intra_ratio_list_threads = [[]] * args.threads

    rows_threads = [[]] * args.threads

    threads_save = deepcopy(args.threads)
    for chromosome in tads_per_chromosome:
        # log.debug('tads_per_chromosome {}'.format(chromosome))
        domainsPerThread = len(chromosome) // args.threads
        if domainsPerThread == 0 and len(chromosome) > 0:
            domainsPerThread = 1
            args.threads = 1
        elif domainsPerThread > 0:
            args.threads = threads_save

        all_data_collected = False
        queue = [None] * args.threads
        process = [None] * args.threads
        thread_done = [False] * args.threads
        # None --> first thread, process first element in list, ignore last one
        # True --> middle thread: ignore first and last element in tad processing
        # False --> last thread: ignore first element, process last one
        thread_id = None
        for i in range(args.threads):

            if args.threads == 1:
                domainListThread = chromosome

            elif i == 0:
                domainListThread = chromosome[i * domainsPerThread:(
                    (i + 1) * domainsPerThread) + 1]
                thread_id = None
            elif i < args.threads - 1:
                domainListThread = chromosome[(i * domainsPerThread) -
                                              1:((i + 1) * domainsPerThread) +
                                              1]
                thread_id = True

            else:
                domainListThread = chromosome[(i * domainsPerThread) - 1:]
                thread_id = False

            if args.threads == 1:
                thread_id = ''

            # log.debug('len(domainListThread) {}'.format(len(domainListThread)))
            # log.debug('len(thread_id) {}'.format(thread_id))

            queue[i] = Queue()
            process[i] = Process(
                target=computeInterIntraTADs,
                kwargs=dict(
                    pMatrix=hic_matrix,
                    # pMatrixControl=hic_matrix_control,
                    pDomainList=domainListThread,
                    pCoolOrH5=is_cooler,
                    # pPValue=args.pValue,
                    pThreadId=thread_id,
                    pQueue=queue[i]))

            process[i].start()
        fail_flag = False
        fail_message = ''
        while not all_data_collected:
            for i in range(args.threads):

                if queue[i] is not None and not queue[i].empty():
                    queue_data = queue[i].get()
                    if 'Fail:' in queue_data:
                        fail_flag = True
                        fail_message = queue_data
                    else:
                        inter_left_sum_list_threads[i], \
                            inter_right_sum_list_threads[i], \
                            inter_left_density_list_threads[i], \
                            inter_right_density_list_threads[i], \
                            inter_left_number_of_contacts_list_threads[i], \
                            inter_right_number_of_contacts_list_threads[i], \
                            inter_left_number_of_contacts_nnz_list_threads[i], \
                            inter_right_number_of_contacts_nzz_list_threads[i], \
                            intra_sum_list_threads[i], \
                            intra_number_of_contacts_list_threads[i], \
                            intra_number_of_contacts_nnz_list_threads[i], \
                            intra_density_list_threads[i], \
                            inter_left_intra_ratio_list_threads[i], \
                            inter_right_intra_ratio_list_threads[i], \
                            inter_left_inter_right_intra_ratio_list_threads[i], \
                            rows_threads[i] = queue_data

                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True
                # elif queue[i] is None and

            all_data_collected = True
            for thread in thread_done:
                if not thread:
                    all_data_collected = False
            time.sleep(1)

        if fail_flag:
            log.error(fail_message[6:])
            exit(1)

        inter_left_sum_list_chromosomes.append([
            item for sublist in inter_left_sum_list_threads for item in sublist
        ])
        inter_right_sum_list_chromosomes.append([
            item for sublist in inter_right_sum_list_threads
            for item in sublist
        ])
        inter_left_density_list_chromosomes.append([
            item for sublist in inter_left_density_list_threads
            for item in sublist
        ])
        inter_right_density_list_chromosomes.append([
            item for sublist in inter_right_density_list_threads
            for item in sublist
        ])
        inter_left_number_of_contacts_list_chromosomes.append([
            item for sublist in inter_left_number_of_contacts_list_threads
            for item in sublist
        ])
        inter_right_number_of_contacts_list_chromosomes.append([
            item for sublist in inter_right_number_of_contacts_list_threads
            for item in sublist
        ])
        inter_left_number_of_contacts_nnz_list_chromosomes.append([
            item for sublist in inter_left_number_of_contacts_nnz_list_threads
            for item in sublist
        ])
        inter_right_number_of_contacts_nzz_list_chromosomes.append([
            item for sublist in inter_right_number_of_contacts_nzz_list_threads
            for item in sublist
        ])

        intra_sum_list_chromosomes.append(
            [item for sublist in intra_sum_list_threads for item in sublist])
        intra_number_of_contacts_list_chromosomes.append([
            item for sublist in intra_number_of_contacts_list_threads
            for item in sublist
        ])
        intra_number_of_contacts_nnz_list_chromosomes.append([
            item for sublist in intra_number_of_contacts_nnz_list_threads
            for item in sublist
        ])
        intra_density_list_chromosomes.append([
            item for sublist in intra_density_list_threads for item in sublist
        ])
        inter_left_intra_ratio_list_chromosomes.append([
            item for sublist in inter_left_intra_ratio_list_threads
            for item in sublist
        ])
        inter_right_intra_ratio_list_chromosomes.append([
            item for sublist in inter_right_intra_ratio_list_threads
            for item in sublist
        ])
        inter_left_inter_right_intra_ratio_list_chromosomes.append([
            item for sublist in inter_left_inter_right_intra_ratio_list_threads
            for item in sublist
        ])

        rows_chromosomes.append(
            [item for sublist in rows_threads for item in sublist])

    inter_left_sum_list = [
        item for sublist in inter_left_sum_list_chromosomes for item in sublist
    ]
    inter_right_sum_list = [
        item for sublist in inter_right_sum_list_chromosomes
        for item in sublist
    ]
    inter_left_density_list = [
        item for sublist in inter_left_density_list_chromosomes
        for item in sublist
    ]
    inter_right_density_list = [
        item for sublist in inter_right_density_list_chromosomes
        for item in sublist
    ]
    inter_left_number_of_contacts_list = [
        item for sublist in inter_left_number_of_contacts_list_chromosomes
        for item in sublist
    ]
    inter_right_number_of_contacts_list = [
        item for sublist in inter_right_number_of_contacts_list_chromosomes
        for item in sublist
    ]
    inter_left_number_of_contacts_nnz_list = [
        item for sublist in inter_left_number_of_contacts_nnz_list_chromosomes
        for item in sublist
    ]
    inter_right_number_of_contacts_nzz_list = [
        item for sublist in inter_right_number_of_contacts_nzz_list_chromosomes
        for item in sublist
    ]

    intra_sum_list = [
        item for sublist in intra_sum_list_chromosomes for item in sublist
    ]
    intra_number_of_contacts_list = [
        item for sublist in intra_number_of_contacts_list_chromosomes
        for item in sublist
    ]
    intra_number_of_contacts_nnz_list = [
        item for sublist in intra_number_of_contacts_nnz_list_chromosomes
        for item in sublist
    ]
    intra_density_list = [
        item for sublist in intra_density_list_chromosomes for item in sublist
    ]
    inter_left_intra_ratio_list = [
        item for sublist in inter_left_intra_ratio_list_chromosomes
        for item in sublist
    ]
    inter_right_intra_ratio_list = [
        item for sublist in inter_right_intra_ratio_list_chromosomes
        for item in sublist
    ]
    inter_left_inter_right_intra_ratio_list = [
        item for sublist in inter_left_inter_right_intra_ratio_list_chromosomes
        for item in sublist
    ]

    rows = [item for sublist in rows_chromosomes for item in sublist]

    with open(args.outFileName, 'w') as file:
        header = '# Created with HiCExplorer\'s hicInterIntraTAD version ' + __version__ + '\n'
        header += '# Chromosome\tstart\tend\tname\tscore\tstrand\tinter_left_sum\tinter_right_sum\tinter_left_density\tinter_right_density\tinter_left_number_of_contacts\tinter_right_number_of_contacts\t'  \
            'inter_left_number_of_contacts_nnz\tinter_right_number_of_contacts_nnz\tintra_sum\tintra_number_of_contacts\tintra_number_of_contacts_nnz\tintra_density\tinter_left_intra_ratio\tinter_right_intra_ratio\tinter_left_inter_right_intra_ratio\n'
        file.write(header)
        for i, row in enumerate(rows):
            row_list = list(map(str, row))

            file.write('\t'.join(row_list))

            file.write('\t{}'.format(inter_left_sum_list[i]))
            file.write('\t{}'.format(inter_right_sum_list[i]))
            file.write('\t{}'.format(inter_left_density_list[i]))
            file.write('\t{}'.format(inter_right_density_list[i]))
            file.write('\t{}'.format(inter_left_number_of_contacts_list[i]))
            file.write('\t{}'.format(inter_right_number_of_contacts_list[i]))
            file.write('\t{}'.format(
                inter_left_number_of_contacts_nnz_list[i]))
            file.write('\t{}'.format(
                inter_right_number_of_contacts_nzz_list[i]))
            file.write('\t{}'.format(intra_sum_list[i]))
            file.write('\t{}'.format(intra_number_of_contacts_list[i]))
            file.write('\t{}'.format(intra_number_of_contacts_nnz_list[i]))
            file.write('\t{}'.format(intra_density_list[i]))
            file.write('\t{}'.format(inter_left_intra_ratio_list[i]))
            file.write('\t{}'.format(inter_right_intra_ratio_list[i]))
            file.write('\t{}'.format(
                inter_left_inter_right_intra_ratio_list[i]))

            file.write('\n')

    plt.scatter(inter_left_intra_ratio_list,
                inter_right_intra_ratio_list,
                s=20,
                alpha=0.7)
    plt.xlabel('Inter-left/intra TAD contact ratio', fontsize=args.fontsize)
    plt.ylabel('Inter-right/intra TAD contact ratio', fontsize=args.fontsize)
    plt.tight_layout()
    plt.savefig(args.outFileNameRatioPlot, dpi=args.dpi)
    plt.close()
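The per-thread slicing above hands every worker one extra TAD on each side of its chunk so that inter-TAD contacts at chunk borders can still be computed, and the pThreadId flag (None / True / False, or '' for a single thread) tells the worker which border TADs to skip so no TAD is counted twice. A standalone sketch of that chunking scheme (hypothetical helper, simplified from the loop above):

def split_domains_with_overlap(domains, threads):
    # split a TAD list into per-thread chunks that overlap by one element
    # at each internal border, mirroring the slicing in main()
    per_thread = len(domains) // threads
    chunks = []
    for i in range(threads):
        if threads == 1:
            chunks.append((domains, ''))                          # single thread
        elif i == 0:
            chunks.append((domains[:per_thread + 1], None))       # first thread
        elif i < threads - 1:
            chunks.append((domains[i * per_thread - 1:(i + 1) * per_thread + 1], True))
        else:
            chunks.append((domains[i * per_thread - 1:], False))  # last thread
    return chunks


# usage sketch: 7 TADs on 3 threads -> chunk sizes 3, 4 and 4 with shared borders
print([len(chunk) for chunk, _flag in split_domains_with_overlap(list(range(7)), 3)])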
Example #12
def main(args=None):

    args = parse_arguments().parse_args(args)

    hic_ma = hm.hiCMatrix(pMatrixFile=args.matrix)
    indices_values = []

    with open(args.regions, 'r') as file:
        for line in file.readlines():
            if len(line.strip()) == 0:
                continue
            _line = line.strip().split('\t')
            if len(_line) == 2:
                chrom, start = _line[0], _line[1]

                viewpoint = (chrom, start, start)
            elif len(_line) >= 3:
                chrom, start, end = _line[0], _line[1], _line[2]
                if args.considerStrandDirection and len(_line) < 6:
                    log.error(
                        'Strand orientation should be considered, but the file does not contain a 6th BED column with this information. Exiting!'
                    )
                    exit(1)

                viewpoint = (chrom, start, end)
            if args.range:
                start_range_genomic, end_range_genomic, start_out, end_out = calculateViewpointRange(
                    hic_ma, viewpoint, args.range,
                    args.coordinatesToBinMapping)
                start_bin, end_bin = getBinIndices(
                    hic_ma, (chrom, start_range_genomic, end_range_genomic))
            else:
                start_bin, end_bin, start_out, end_out = calculateViewpointRangeBins(
                    hic_ma, viewpoint, args.rangeInBins,
                    args.coordinatesToBinMapping)
            if args.considerStrandDirection:
                indices_values.append(
                    [start_bin, end_bin, start_out, end_out, _line[5]])

            else:
                indices_values.append(
                    [start_bin, end_bin, start_out, end_out, None])

    if args.range:
        dimensions_new_matrix = (args.range[0] // hic_ma.getBinSize()) + (
            args.range[1] // hic_ma.getBinSize())
    elif args.rangeInBins:
        dimensions_new_matrix = args.rangeInBins[0] + args.rangeInBins[1]

    summed_matrix = lil_matrix((dimensions_new_matrix, dimensions_new_matrix),
                               dtype=np.float32)
    count_matrix = np.zeros(shape=(dimensions_new_matrix,
                                   dimensions_new_matrix))

    # max_length = hic_ma.matrix.shape[1]
    for start, end, start_out, end_out, orientation in indices_values:
        _start = 0
        _end = summed_matrix.shape[1]
        # if start < 0:
        #     _start = np.absolute(start)
        #     start = 0
        # if end >= max_length:
        #     _end = end
        #     end = max_length
        orig_matrix_length = end - start
        if start_out:
            _start = _end - orig_matrix_length
        if end_out:
            _end = start + orig_matrix_length
        submatrix = hic_ma.matrix[start:end, start:end]
        if summed_matrix.shape != submatrix.shape:
            log.warning('Shape of a submatrix does not match. It is ignored.')
            log.warning('Region: {}'.format(hic_ma.getBinPos(start)))
            continue
        count_matrix[_start:_end, _start:_end] += 1

        if orientation is None or orientation == '+':
            summed_matrix[_start:_end, _start:_end] += hic_ma.matrix[start:end,
                                                                     start:end]
        elif orientation == '-':

            summed_matrix[_start:_end,
                          _start:_end] += hic_ma.matrix[start:end, start:end].T
    summed_matrix /= count_matrix
    summed_matrix = np.array(summed_matrix)
    data = summed_matrix[np.nonzero(summed_matrix)]
    row = np.nonzero(summed_matrix)[0]
    col = np.nonzero(summed_matrix)[1]
    summed_matrix = csr_matrix(
        (data, (row, col)),
        shape=(dimensions_new_matrix, dimensions_new_matrix))
    save_npz(args.outFileName, summed_matrix)
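The loop above accumulates a fixed-size window around every viewpoint into summed_matrix, keeps a parallel count_matrix of how many windows contributed to each cell, and divides sum by count at the end; '-'-strand viewpoints are added transposed so all windows share one orientation. A toy NumPy sketch of the sum/count averaging (simplified: windows are placed at the origin instead of the _start/_end alignment used above):

import numpy as np

window = 4                                    # window size in bins
summed = np.zeros((window, window))
counts = np.zeros((window, window))

submatrices = [np.ones((window, window)),     # a full window
               np.full((2, 2), 3.0)]          # a smaller window, e.g. near a chromosome end

for sub in submatrices:
    n = sub.shape[0]
    summed[:n, :n] += sub                     # accumulate the (possibly smaller) window
    counts[:n, :n] += 1                       # remember how many windows hit each cell

averaged = summed / np.maximum(counts, 1)     # avoid dividing by zero for untouched cells
print(averaged)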
Example #13
def mergeLoops(pDataFrame, pLowestResolution, pTupleX, pTupleY):
    hic = hm.hiCMatrix()
    target_regions_intervaltree_x = hic.intervalListToIntervalTree(pTupleX)[0]
    target_regions_intervaltree_y = hic.intervalListToIntervalTree(pTupleY)[0]

    for i, loop in enumerate(pDataFrame.values):
        # neighborhood factor to extend the search range; this lets smaller
        # bins be treated as if they were bins of the lowest resolution
        neighborhood_factor_x = int(pLowestResolution) - abs(
            int(loop[2]) - int(loop[1]))
        neighborhood_factor_y = int(pLowestResolution) - abs(
            int(loop[5]) - int(loop[4]))

        x_interval = set()
        y_interval = set()
        if loop[0] in target_regions_intervaltree_x:
            x_interval = target_regions_intervaltree_x[loop[0]].overlap(
                loop[1] - neighborhood_factor_x - 1,
                loop[2] + neighborhood_factor_x + 1)
        if loop[3] in target_regions_intervaltree_y:
            y_interval = target_regions_intervaltree_y[loop[3]].overlap(
                loop[4] - neighborhood_factor_y - 1,
                loop[5] + neighborhood_factor_y + 1)

        if len(x_interval) <= 1 or len(y_interval) <= 1:
            continue

        dict_of_interest_x = {}
        list_of_interest = []
        for data in x_interval:
            dict_of_interest_x[data[2]] = [data[0], data[1]]
        for data in y_interval:
            if data[2] in dict_of_interest_x:
                list_of_interest.append(data)

        max_index = 0
        max_distance = 0
        all_id_list = []
        for data in list_of_interest:
            if abs(data[0] - data[1]) > max_distance:
                max_distance = abs(data[0] - data[1])
                max_index = data[2]
            all_id_list.append(data[2])
        for data in x_interval:
            if data[2] == max_index:
                continue
            if data[2] not in all_id_list:
                continue
            target_regions_intervaltree_x[loop[0]].remove(data)

        for data in y_interval:
            if data[2] == max_index:
                continue
            if data[2] not in all_id_list:
                continue
            target_regions_intervaltree_y[loop[0]].remove(data)

    result_list_index = []
    dict_x = {}
    dict_y = {}
    for chromosome_x, chromosome_y in zip(target_regions_intervaltree_x,
                                          target_regions_intervaltree_y):
        target_regions_intervaltree_x[chromosome_x] = sorted(
            target_regions_intervaltree_x[chromosome_x])
        target_regions_intervaltree_y[chromosome_y] = sorted(
            target_regions_intervaltree_y[chromosome_y])

        for x in target_regions_intervaltree_x[chromosome_x]:
            dict_x[x[2]] = (x[0], x[1])
        for y in target_regions_intervaltree_y[chromosome_y]:
            dict_y[y[2]] = (y[0], y[1])
        for x in dict_x:
            if x in dict_y:
                result_list_index.append(x)

        dict_x = None
        dict_x = {}
        dict_y = None
        dict_y = {}
    return result_list_index
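Among loop calls whose x and y anchors overlap within the lowest-resolution neighborhood, mergeLoops keeps only the call with the widest anchor and removes the others from both interval trees. A minimal sketch of that 'keep the widest overlapping interval' step with the intervaltree package (toy data, assuming intervaltree >= 3 for .overlap(); the tool itself builds its trees via hiCMatrix.intervalListToIntervalTree):

from intervaltree import IntervalTree

# toy anchors: (start, end, loop_id)
tree = IntervalTree()
for start, end, loop_id in [(100, 200, 0), (150, 260, 1), (500, 600, 2)]:
    tree.addi(start, end, loop_id)

hits = tree.overlap(100, 250)                # all anchors overlapping the query window
if len(hits) > 1:
    widest = max(hits, key=lambda iv: iv.end - iv.begin)
    for iv in hits:
        if iv is not widest:
            tree.remove(iv)                  # drop the narrower duplicates

print(sorted((iv.begin, iv.end, iv.data) for iv in tree))
# -> [(150, 260, 1), (500, 600, 2)]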
Example #14
def main(args=None):

    args = parse_arguments().parse_args(args)

    if not (args.outFileName.endswith('.h5')
            or args.outFileName.endswith('.cool')):
        log.error('Output filetype not known.')
        log.error('It is: {}'.format(args.outFileName))
        log.error('Accepted is .h5 or .cool')
        exit(1)

    hic_ma = hm.hiCMatrix(pMatrixFile=args.matrix)
    log.info("hic_ma.matrix: {}".format(hic_ma.matrix))
    if args.chromosomes:
        hic_ma.keepOnlyTheseChr(args.chromosomes)

    length_chromosome = 0
    chromosome_count = len(hic_ma.getChrNames())
    for chrname in hic_ma.getChrNames():
        chr_range = hic_ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]
    trasf_matrix = lil_matrix(hic_ma.matrix.shape)

    if args.method == 'norm':
        trasf_matrix = lil_matrix(hic_ma.matrix.shape)
        # trasf_matrix_pearson = lil_matrix(hic_ma.matrix.shape)
        # trasf_matrix_corr = lil_matrix(hic_ma.matrix.shape)

        for chrname in hic_ma.getChrNames():
            chr_range = hic_ma.getChrBinRange(chrname)
            submatrix = hic_ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]

            submatrix.astype(float)
            submatrix = _obs_exp_norm(submatrix, length_chromosome,
                                      chromosome_count)

            submatrix = __pearson(submatrix)
            trasf_matrix[chr_range[0]:chr_range[1],
                         chr_range[0]:chr_range[1]] = lil_matrix(submatrix)

        # hic_ma.setMatrix(trasf_matrix.tocsr(), cut_intervals=hic_ma.cut_intervals)
        # hic_ma.save('obs_norm_pearson.'+ args.outFileName, pSymmetric=False, pApplyCorrection=False)

    elif args.method == 'obs_exp':
        for chrname in hic_ma.getChrNames():
            chr_range = hic_ma.getChrBinRange(chrname)
            submatrix = hic_ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]
            submatrix.astype(float)
            trasf_matrix[chr_range[0]:chr_range[1],
                         chr_range[0]:chr_range[1]] = lil_matrix(
                             __obs_exp(submatrix, length_chromosome,
                                       chromosome_count))

    elif args.method == 'pearson':
        for chrname in hic_ma.getChrNames():
            chr_range = hic_ma.getChrBinRange(chrname)
            submatrix = hic_ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]
            log.debug("shape: {}".format(submatrix.shape))

            submatrix.astype(float)
            log.debug("shape: {}".format(submatrix.shape))

            trasf_matrix[chr_range[0]:chr_range[1],
                         chr_range[0]:chr_range[1]] = lil_matrix(
                             __pearson(submatrix.todense()))

    elif args.method == 'covariance':
        for chrname in hic_ma.getChrNames():
            chr_range = hic_ma.getChrBinRange(chrname)
            submatrix = hic_ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]
            log.debug("shape: {}".format(submatrix.shape))

            submatrix.astype(float)
            log.debug("shape: {}".format(submatrix.shape))

            corrmatrix = np.cov(submatrix.todense())
            trasf_matrix[chr_range[0]:chr_range[1],
                         chr_range[0]:chr_range[1]] = lil_matrix(corrmatrix)

    elif args.method == 'all':
        trasf_matrix_obs_exp = lil_matrix(hic_ma.matrix.shape)
        trasf_matrix_pearson = lil_matrix(hic_ma.matrix.shape)
        trasf_matrix_corr = lil_matrix(hic_ma.matrix.shape)

        for chrname in hic_ma.getChrNames():
            chr_range = hic_ma.getChrBinRange(chrname)
            submatrix = hic_ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]

            submatrix.astype(float)
            submatrix = __obs_exp(submatrix, length_chromosome,
                                  chromosome_count)

            trasf_matrix_obs_exp[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = lil_matrix(
                                     submatrix)
            submatrix = __pearson(submatrix)

            trasf_matrix_pearson[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = lil_matrix(
                                     submatrix)
            corrmatrix = np.cov(submatrix)
            trasf_matrix_corr[chr_range[0]:chr_range[1],
                              chr_range[0]:chr_range[1]] = lil_matrix(
                                  corrmatrix)

        hic_ma.setMatrix(trasf_matrix_obs_exp.tocsr(),
                         cut_intervals=hic_ma.cut_intervals)

        basename_outFileName = basename(args.outFileName)
        basename_obs_exp = "obs_exp_" + basename_outFileName
        basename_pearson = "pearson_" + basename_outFileName
        basename_covariance = "covariance_" + basename_outFileName
        path = dirname(args.outFileName)
        if path != '':
            path += '/'

        hic_ma.save(path + basename_obs_exp,
                    pSymmetric=False,
                    pApplyCorrection=False)

        hic_ma.setMatrix(trasf_matrix_pearson.tocsr(),
                         cut_intervals=hic_ma.cut_intervals)
        hic_ma.save(path + basename_pearson,
                    pSymmetric=False,
                    pApplyCorrection=False)

        hic_ma.setMatrix(trasf_matrix_corr.tocsr(),
                         cut_intervals=hic_ma.cut_intervals)
        hic_ma.save(path + basename_covariance,
                    pSymmetric=False,
                    pApplyCorrection=False)

    if not args.method == 'all':
        hic_ma.setMatrix(trasf_matrix.tocsr(),
                         cut_intervals=hic_ma.cut_intervals)
        hic_ma.save(args.outFileName, pSymmetric=False, pApplyCorrection=False)
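Every transform above operates per chromosome on the intra-chromosomal block. The observed/expected variants divide each contact by the expected value for its genomic distance, i.e. by the mean of its diagonal; a minimal dense sketch of that per-diagonal normalisation (a simplification, not HiCExplorer's __obs_exp / _obs_exp_norm):

import numpy as np


def obs_exp_dense(matrix):
    # divide each entry by the mean of its diagonal (the distance-based expected value)
    matrix = np.asarray(matrix, dtype=float)
    result = np.zeros_like(matrix)
    n = matrix.shape[0]
    for distance in range(n):
        diag = np.diagonal(matrix, offset=distance)
        expected = diag.mean()
        if expected > 0:
            idx = np.arange(n - distance)
            result[idx, idx + distance] = diag / expected
            result[idx + distance, idx] = diag / expected   # keep the result symmetric
    return result


print(obs_exp_dense([[4.0, 2.0], [2.0, 1.0]]))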
Example #15
def main(args=None):
    args = parse_arguments().parse_args(args)
    if args.verbose:
        log.setLevel(logging.INFO)

    # args.chromosomes
    if check_cooler(args.matrix) and args.chromosomes is not None and len(
            args.chromosomes) == 1:
        ma = hm.hiCMatrix(args.matrix, pChrnameList=toString(args.chromosomes))
    else:
        ma = hm.hiCMatrix(args.matrix)

        if args.chromosomes:
            ma.reorderChromosomes(toString(args.chromosomes))

    # mask all zero value bins
    row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
    ma.maskBins(np.flatnonzero(row_sum == 0))
    matrix_shape = ma.matrix.shape
    ma.matrix = convertNansToZeros(ma.matrix)
    ma.matrix = convertInfsToZeros(ma.matrix)

    if 'plotName' in args:
        plot_total_contact_dist(ma, args)
        log.info("Saving diagnostic plot {}\n".format(args.plotName))
        return

    log.info("matrix contains {} data points. Sparsity {:.3f}.".format(
        len(ma.matrix.data),
        float(len(ma.matrix.data)) / (ma.matrix.shape[0]**2)))

    if args.skipDiagonal:
        ma.diagflat(value=0)

    outlier_regions = filter_by_zscore(ma,
                                       args.filterThreshold[0],
                                       args.filterThreshold[1],
                                       perchr=args.perchr)
    # compute and print some statistics
    pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
    ma.printchrtoremove(outlier_regions,
                        label="Bins that are MAD outliers ({:.2f}%) "
                        "out of".format(pct_outlier, ma.matrix.shape[0]),
                        restore_masked_bins=False)

    assert matrix_shape == ma.matrix.shape
    # mask filtered regions
    ma.maskBins(outlier_regions)
    total_filtered_out = set(outlier_regions)

    if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1:
        chrom, _, _, coverage = zip(*ma.cut_intervals)

        assert type(coverage[0]) == np.float64

        failed_bins = np.flatnonzero(
            np.array(coverage) < args.sequencedCountCutoff)

        ma.printchrtoremove(failed_bins,
                            label="Bins with low coverage",
                            restore_masked_bins=False)
        ma.maskBins(failed_bins)
        total_filtered_out = total_filtered_out.union(failed_bins)
        """
        ma.matrix, to_remove = fill_gaps(ma, failed_bins)
        log.warning("From {} failed bins, {} could "
                         "not be filled\n".format(len(failed_bins),
                                                  len(to_remove)))
        ma.maskBins(to_remove)
        """

    if args.transCutoff and 0 < args.transCutoff < 100:
        cutoff = float(args.transCutoff) / 100
        # a usual cutoff is 0.05
        ma.truncTrans(high=cutoff)

    pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    correction_factors = []
    if args.perchr:
        corrected_matrix = lil_matrix(ma.matrix.shape)
        # normalize each chromosome independently
        for chrname in list(ma.interval_trees):
            chr_range = ma.getChrBinRange(chrname)
            chr_submatrix = ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]
            _matrix, _corr_factors = iterative_correction(chr_submatrix, args)
            corrected_matrix[chr_range[0]:chr_range[1],
                             chr_range[0]:chr_range[1]] = _matrix
            correction_factors.append(_corr_factors)
        correction_factors = np.concatenate(correction_factors)

    else:
        corrected_matrix, correction_factors = iterative_correction(
            ma.matrix, args)

    ma.setMatrixValues(corrected_matrix)
    ma.setCorrectionFactors(correction_factors)
    log.info("Correction factors {}".format(correction_factors[:10]))
    if args.inflationCutoff and args.inflationCutoff > 0:
        after_row_sum = np.asarray(corrected_matrix.sum(axis=1)).flatten()
        # identify rows that were expanded more than args.inflationCutoff times
        to_remove = np.flatnonzero(
            after_row_sum / pre_row_sum >= args.inflationCutoff)
        ma.printchrtoremove(to_remove,
                            label="inflated >={} "
                            "regions".format(args.inflationCutoff),
                            restore_masked_bins=False)
        total_filtered_out = total_filtered_out.union(to_remove)

        ma.maskBins(to_remove)

    ma.printchrtoremove(sorted(list(total_filtered_out)),
                        label="Total regions to be removed",
                        restore_masked_bins=False)

    ma.save(args.outFileName, pApplyCorrection=False)
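After outlier, coverage and trans filtering, the matrix is balanced with iterative_correction (per chromosome when --perchr is set). A minimal dense sketch of the underlying idea, iterative correction / matrix balancing, which repeatedly rescales rows and columns by their coverage until the row sums even out (a simplification, not HiCExplorer's implementation):

import numpy as np


def ice_balance(matrix, iterations=50):
    # toy iterative correction: divide by the outer product of normalised row sums
    matrix = np.asarray(matrix, dtype=float).copy()
    total_bias = np.ones(matrix.shape[0])
    for _ in range(iterations):
        coverage = matrix.sum(axis=1)
        bias = np.where(coverage > 0,
                        coverage / coverage[coverage > 0].mean(), 1.0)
        matrix /= np.outer(bias, bias)        # symmetric row/column rescaling
        total_bias *= bias
    return matrix, total_bias


balanced, factors = ice_balance(np.array([[10.0, 2.0], [2.0, 1.0]]))
print(balanced.sum(axis=1))                   # row sums converge to a common value
print(factors)                                # accumulated correction factors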
Example #16
def main(args=None):
    args = parse_arguments().parse_args(args)
    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        log.error("Number of output file names and number of eigenvectors"
                  " does not match. Please"
                  "provide the name of each file.\nFiles: {}\nNumber of "
                  "eigenvectors: {}".format(args.outputFileName,
                                            args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []
    # PCA is computed per chromosome
    length_chromosome = 0
    chromosome_count = len(ma.getChrNames())
    if args.pearsonMatrix:
        trasf_matrix_pearson = lil_matrix(ma.matrix.shape)

    if args.obsexpMatrix:
        trasf_matrix_obsexp = lil_matrix(ma.matrix.shape)

    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]
    if args.extraTrack and (args.extraTrack.endswith('.bw')
                            or args.extraTrack.endswith('.bigwig')):
        bwTrack = pyBigWig.open(args.extraTrack, 'r')
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)

        submatrix = ma.matrix[chr_range[0]:chr_range[1],
                              chr_range[0]:chr_range[1]]
        if args.norm:
            obs_exp_matrix_ = obs_exp_matrix_norm(submatrix)

        else:
            obs_exp_matrix_ = obs_exp_matrix_lieberman(submatrix,
                                                       length_chromosome,
                                                       chromosome_count)
        obs_exp_matrix_ = convertNansToZeros(
            csr_matrix(obs_exp_matrix_)).todense()
        obs_exp_matrix_ = convertInfsToZeros(
            csr_matrix(obs_exp_matrix_)).todense()

        if args.obsexpMatrix:
            trasf_matrix_obsexp[chr_range[0]:chr_range[1],
                                chr_range[0]:chr_range[1]] = lil_matrix(
                                    obs_exp_matrix_)

        pearson_correlation_matrix = np.corrcoef(obs_exp_matrix_)
        pearson_correlation_matrix = convertNansToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()
        pearson_correlation_matrix = convertInfsToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()

        if args.pearsonMatrix:
            trasf_matrix_pearson[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = lil_matrix(
                                     pearson_correlation_matrix)

        corrmatrix = np.cov(pearson_correlation_matrix)
        corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
        corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
        evals, eigs = linalg.eig(corrmatrix)
        k = args.numberOfEigenvectors

        chrom, start, end, _ = zip(
            *ma.cut_intervals[chr_range[0]:chr_range[1]])

        chrom_list += chrom
        start_list += start
        end_list += end
        if args.extraTrack and (args.extraTrack.endswith('.bw')
                                or args.extraTrack.endswith('.bigwig')):
            assert (len(end) == len(start))
            correlateEigenvectorWithHistonMarkTrack(eigs[:, :k].transpose(),
                                                    bwTrack, chrname, start,
                                                    end, args.extraTrack,
                                                    args.histonMarkType)

        vecs_list += eigs[:, :k].tolist()

    if args.pearsonMatrix:
        file_type = 'cool'
        if args.pearsonMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_pearson.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.pearsonMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.obsexpMatrix:
        file_type = 'cool'
        if args.obsexpMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_obsexp.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.obsexpMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.extraTrack and not args.extraTrack.endswith(
            '.bw') and not args.extraTrack.endswith('.bigwig'):
        vecs_list = correlateEigenvectorWithGeneTrack(ma, vecs_list,
                                                      args.extraTrack)

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert (len(vecs_list) == len(chrom_list))

            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    if len(value) == args.numberOfEigenvectors:
                        if isinstance(value[idx], complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(
                            toString(chrom_list[i]), start_list[i],
                            end_list[i], value[idx]))

    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error("ERROR: Your version of pyBigWig is not supporting "
                      "numpy: {}".format(pyBigWig.__file__))
            exit(1)
        old_chrom = chrom_list[0]
        header = []
        for i, _chrom in enumerate(chrom_list):
            if old_chrom != _chrom:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = _chrom

        header.append((toString(chrom_list[-1]), end_list[-1]))
        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))

            assert (len(vecs_list) == len(chrom_list))
            _chrom_list = []
            _start_list = []
            _end_list = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            # set big wig header
            bw.addHeader(header)
            # create entry lists
            for i, value in enumerate(vecs_list):
                # it can happen that some 'value' is having less dimensions than it should
                if len(value) == args.numberOfEigenvectors:
                    if isinstance(value[idx], complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    _chrom_list.append(toString(chrom_list[i]))
                    _start_list.append(start_list[i])
                    _end_list.append(end_list[i])

            # write entries
            bw.addEntries(_chrom_list,
                          _start_list,
                          ends=_end_list,
                          values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)
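The per-chromosome pipeline above is: intra-chromosomal submatrix -> observed/expected -> Pearson correlation matrix -> covariance -> eigenvectors, with the leading eigenvectors written out as the compartment track. A minimal dense sketch of the last steps on a toy obs/exp block (real data goes through hm.hiCMatrix and the obs_exp_matrix_* helpers first):

import numpy as np

# toy observed/expected matrix for one chromosome (4 bins)
obs_exp = np.array([[1.0, 1.2, 0.8, 0.7],
                    [1.2, 1.0, 0.9, 0.8],
                    [0.8, 0.9, 1.0, 1.3],
                    [0.7, 0.8, 1.3, 1.0]])

pearson = np.corrcoef(obs_exp)            # bin-by-bin correlation of contact profiles
covariance = np.cov(pearson)              # as in the code above
evals, eigs = np.linalg.eig(covariance)

order = np.argsort(evals)[::-1]           # sort eigenvectors by eigenvalue
first_eigenvector = np.real(eigs[:, order[0]])
print(first_eigenvector)                  # its sign pattern separates A/B compartments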
Example #17
def main(args=None):

    args = parse_arguments().parse_args(args)

    if args.labels and len(args.matrices) != len(args.labels):
        log.error("The number of labels does not match the number of matrices.")
        exit(1)
    if not args.labels:
        args.labels = [os.path.basename(x) for x in args.matrices]

    num_files = len(args.matrices)
    # initialize results matrix
    results = np.zeros((num_files, num_files), dtype='float')

    rows, cols = np.triu_indices(num_files)
    correlation_opts = {'spearman': spearmanr,
                        'pearson': pearsonr}
    hic_mat_list = []
    max_value = None
    min_value = None
    all_mat = None
    all_nan = []

    for i, matrix in enumerate(args.matrices):
        log.info("loading hic matrix {}\n".format(matrix))

        if (check_cooler(args.matrices[i])) and args.chromosomes is not None and len(args.chromosomes) == 1:
            _mat = hm.hiCMatrix(matrix, pChrnameList=args.chromosomes)
        else:
            _mat = hm.hiCMatrix(matrix)
            if args.chromosomes:
                _mat.keepOnlyTheseChr(args.chromosomes)
            _mat.filterOutInterChrCounts()

        _mat.diagflat(0)
        log.info("restore masked bins {}\n".format(matrix))
        bin_size = _mat.getBinSize()
        all_nan = np.unique(np.concatenate([all_nan, _mat.nan_bins]))

        _mat = triu(_mat.matrix, k=0, format='csr')
        if args.range:
            min_dist, max_dist = args.range.split(":")
            min_dist = int(min_dist)
            max_dist = int(max_dist)
            if max_dist < bin_size:
                log.error("Please specify a max range that is larger than bin size ({})".format(bin_size))
                exit()
            max_depth_in_bins = int(max_dist / bin_size)
            max_dist = int(max_dist) // bin_size
            min_dist = int(min_dist) // bin_size
            # work only with the upper matrix
            # and remove all pixels that are beyond max_depth_in_bins
            # (this is done by subtracting a second sparse matrix
            # that contains only the part of the upper matrix to be removed)
            _mat = triu(_mat, k=0, format='csr') - triu(_mat, k=max_depth_in_bins, format='csr')

            _mat.eliminate_zeros()

            _mat_coo = _mat.tocoo()
            dist = _mat_coo.col - _mat_coo.row
            keep = np.flatnonzero((dist <= max_dist) & (dist >= min_dist))
            _mat_coo.data = _mat_coo.data[keep]
            _mat_coo.row = _mat_coo.row[keep]
            _mat_coo.col = _mat_coo.col[keep]
            _mat = _mat_coo.tocsr()
        else:
            _mat = triu(_mat, k=0, format='csr')

        if args.log1p:
            _mat.data = np.log1p(_mat.data)
        if all_mat is None:
            all_mat = _mat
        else:
            all_mat = all_mat + _mat

        if max_value is None or max_value < _mat.data.max():
            max_value = _mat.data.max()
        if min_value is None or min_value > _mat.data.min():
            min_value = _mat.data.min()

        hic_mat_list.append(_mat)

    # remove nan bins
    rows_keep = cols_keep = np.delete(list(range(all_mat.shape[1])),
                                      all_nan.astype(int))
    all_mat = all_mat[rows_keep, :][:, cols_keep]

    # make large matrix to correlate by
    # using sparse matrix tricks

    big_mat = None
    for mat in hic_mat_list:
        mat = mat[rows_keep, :][:, cols_keep]
        sample_vector = (mat + all_mat).data - all_mat.data
        if big_mat is None:
            big_mat = sample_vector
        else:
            big_mat = np.vstack([big_mat, sample_vector])

    # take the transpose such that columns represent each of the samples
    big_mat = np.ma.masked_invalid(big_mat).T

    grids = gridspec.GridSpec(num_files, num_files)
    grids.update(wspace=0, hspace=0)
    fig = plt.figure(figsize=(2 * num_files, 2 * num_files))
    plt.rcParams['font.size'] = 8.0

    min_value = int(big_mat.min())
    max_value = int(big_mat.max())
    if (min_value % 2 == 0 and max_value % 2 == 0) or \
            (min_value % 2 == 1 and max_value % 2 == 1):
        # make one value odd and the other even
        max_value += 1

    if args.log1p:
        major_locator = FixedLocator(list(range(min_value, max_value, 2)))
        minor_locator = FixedLocator(list(range(min_value, max_value, 1)))

    for index in range(len(rows)):
        row = rows[index]
        col = cols[index]
        if row == col:
            results[row, col] = 1

            # add titles as
            # empty plot in the diagonal
            ax = fig.add_subplot(grids[row, col])
            ax.text(0.6, 0.6, args.labels[row],
                    verticalalignment='center',
                    horizontalalignment='center',
                    fontsize=10, fontweight='bold',
                    transform=ax.transAxes)
            ax.set_axis_off()
            continue

        log.info("comparing {} and {}\n".format(args.matrices[row],
                                                args.matrices[col]))

        # remove cases in which both are zero or one is zero and
        # the other is one
        _mat = big_mat[:, [row, col]]
        _mat = _mat[_mat.sum(axis=1) > 1, :]
        vector1 = _mat[:, 0]
        vector2 = _mat[:, 1]

        results[row, col] = correlation_opts[args.method](vector1, vector2)[0]

        # scatter plots
        ax = fig.add_subplot(grids[row, col])
        if args.log1p:
            ax.xaxis.set_major_locator(major_locator)
            ax.xaxis.set_minor_locator(minor_locator)
            ax.yaxis.set_major_locator(major_locator)
            ax.yaxis.set_minor_locator(minor_locator)

        ax.text(0.2, 0.8, "{}={:.2f}".format(args.method,
                                             results[row, col]),
                horizontalalignment='left',
                transform=ax.transAxes)
        ax.get_yaxis().set_tick_params(
            which='both',
            left='off',
            right='off',
            direction='out')

        ax.get_xaxis().set_tick_params(
            which='both',
            top='off',
            bottom='off',
            direction='out')

        if col != num_files - 1:
            ax.set_yticklabels([])
        else:
            ax.yaxis.tick_right()
            ax.get_yaxis().set_tick_params(
                which='both',
                left='off',
                right='on',
                direction='out')
        if col - row == 1:
            ax.xaxis.tick_bottom()
            ax.get_xaxis().set_tick_params(
                which='both',
                top='off',
                bottom='on',
                direction='out')
        else:
            ax.set_xticklabels([])

        ax.hist2d(vector1, vector2, bins=150, cmin=0.1)
    fig.tight_layout()
    log.info("saving {}".format(args.outFileNameScatter))
    fig.savefig(args.outFileNameScatter, bbox_inches='tight')

    results = results + np.triu(results, 1).T
    plot_correlation(results, args.labels,
                     args.outFileNameHeatmap,
                     args.zMax,
                     args.zMin,
                     args.colorMap,
                     image_format=args.plotFileFormat)
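The pairwise comparison above turns each matrix into a vector over a shared set of positions with the (mat + all_mat).data - all_mat.data trick: adding the union matrix forces every sample onto the same sparsity pattern, and subtracting its data afterwards recovers the sample's own values, aligned position by position. A small standalone sketch of that alignment plus the pearsonr call (toy matrices):

import numpy as np
from scipy.sparse import csr_matrix
from scipy.stats import pearsonr

mat1 = csr_matrix(np.array([[0.0, 2.0], [0.0, 5.0]]))
mat2 = csr_matrix(np.array([[1.0, 0.0], [0.0, 4.0]]))
all_mat = mat1 + mat2                        # union of the non-zero patterns

# both vectors now refer to the same positions, so they can be correlated directly
vector1 = (mat1 + all_mat).data - all_mat.data
vector2 = (mat2 + all_mat).data - all_mat.data
print(pearsonr(vector1, vector2)[0])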
Example #18
    def __init__(self, *args, **kwargs):
        super(EngineHiCTrack, self).__init__(*args, **kwargs)

        log.debug('FILE {}'.format(self.properties))
        # log.debug('pRegion {}'.format(pRegion))
        region = None
        if self.properties['region'] is not None:
            if self.properties['region'][2] == 1e15:
                region = [str(self.properties['region'][0])]
            elif len(self.properties['region']) == 3:
                start = int(self.properties['region'][1]) - int(
                    self.properties['depth'])
                if start < 0:
                    start = 0
                end = int(self.properties['region'][2]) + int(
                    self.properties['depth'])

                region = [
                    str(self.properties['region'][0]) + ':' + str(start) +
                    '-' + str(end)
                ]

        # initialize matrix as HiCMatrix object with no data
        self.hic_ma = HiCMatrix.hiCMatrix(pMatrixFile=None,
                                          pChrnameList=region)
        # create matrix to fill out data and intervals
        if 'matrix shape' not in self.properties:
            self.properties['matrix shape'] = 1000
        if 'binsize' not in self.properties:
            self.properties['binsize'] = 3000
        if 'intervals start' not in self.properties:
            self.properties['intervals start'] = 0

        self.hic_ma.matrix, self.hic_ma.cut_intervals = \
            self.definematrix(self.properties['matrix shape'], self.properties['binsize'], self.properties['intervals start'], self.properties['chrom'])

        self.hic_ma.interval_trees, self.hic_ma.chrBinBoundaries = \
            self.hic_ma.intervalListToIntervalTree(self.hic_ma.cut_intervals)

        if len(self.hic_ma.matrix.data) == 0:
            self.log.error("Matrix {} is empty".format(
                self.properties['file']))
            exit(1)
        if 'show_masked_bins' in self.properties and self.properties[
                'show_masked_bins'] == 'yes':
            pass
        else:
            self.hic_ma.maskBins(self.hic_ma.nan_bins)

        # check that the matrix can be log transformed
        if 'transform' in self.properties:
            if self.properties['transform'] == 'log1p':
                if self.hic_ma.matrix.data.min() + 1 < 0:
                    self.log.error(
                        "\n*ERROR*\nMatrix contains negative values.\n"
                        "log1p transformation can not be applied to \n"
                        "values in matrix: {}".format(self.properties['file']))
                    exit(1)

            elif self.properties['transform'] == '-log':
                if self.hic_ma.matrix.data.min() < 0:
                    self.log.error(
                        "\n*ERROR*\nMatrix contains negative values.\n"
                        "log(-1 * <values>) transformation can not be applied to \n"
                        "values in matrix: {}".format(self.properties['file']))
                    exit(1)

            elif self.properties['transform'] == 'log':
                if self.hic_ma.matrix.data.min() < 0:
                    self.log.error(
                        "\n*ERROR*\nMatrix contains negative values.\n"
                        "log transformation can not be applied to \n"
                        "values in matrix: {}".format(self.properties['file']))
                    exit(1)

        binsize = self.hic_ma.getBinSize()
        max_depth_in_bins = int(self.properties['depth'] / binsize)

        # work only with the upper matrix
        # and remove all pixels that are beyond
        # 2 * max_depth_in_bins, which are not required
        # (this is done by subtracting a second sparse matrix
        # that contains only the part of the upper matrix to be removed)
        limit = 2 * max_depth_in_bins
        self.hic_ma.matrix = scipy.sparse.triu(self.hic_ma.matrix, k=0, format='csr') - \
            scipy.sparse.triu(self.hic_ma.matrix, k=limit, format='csr')
        self.hic_ma.matrix.eliminate_zeros()

        # fill the main diagonal, otherwise it looks
        # not so good. The main diagonal is filled
        # with an array containing the max value found
        # in the matrix
        if sum(self.hic_ma.matrix.diagonal()) == 0:
            self.log.info(
                "Filling main diagonal with max value because it empty and looks bad...\n"
            )
            max_value = self.hic_ma.matrix.data.max()
            main_diagonal = scipy.sparse.dia_matrix(
                ([max_value] * self.hic_ma.matrix.shape[0], [0]),
                shape=self.hic_ma.matrix.shape)
            self.hic_ma.matrix = self.hic_ma.matrix + main_diagonal

        self.plot_inverted = False
        if 'orientation' in self.properties and self.properties[
                'orientation'] == 'inverted':
            self.plot_inverted = True

        self.norm = None

        if 'colormap' not in self.properties:
            self.properties['colormap'] = DEFAULT_MATRIX_COLORMAP
        self.cmap = cm.get_cmap(self.properties['colormap'])
        self.cmap.set_bad('white')
        #self.cmap.set_over('blue')
        self.background = True
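The depth limit above uses a sparse trick: subtracting triu(matrix, k=limit) from triu(matrix, k=0) leaves only the pixels fewer than limit bins above the main diagonal, which is all the track needs to draw. A standalone sketch of that banding step (scipy.sparse only):

import numpy as np
import scipy.sparse

matrix = scipy.sparse.csr_matrix(np.arange(1, 26, dtype=float).reshape(5, 5))
limit = 2    # keep only pixels at most 2 bins above the main diagonal

banded = scipy.sparse.triu(matrix, k=0, format='csr') - \
    scipy.sparse.triu(matrix, k=limit, format='csr')
banded.eliminate_zeros()

print(banded.toarray())    # upper triangle, zeroed beyond the 2-bin band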
Example #19
def main(args=None):
    args = parse_arguments().parse_args(args)

    viewpointObj = Viewpoint()
    referencePoints, _ = viewpointObj.readReferencePointFile(
        args.referencePoints)

    # compute for each viewpoint the sparsity and consider these as bad with a sparsity less than given.

    referencePointsPerThread = len(referencePoints) // args.threads
    queue = [None] * args.threads
    process = [None] * args.threads
    sparsity = []
    fail_flag = False
    fail_message = ''
    for j, matrix in enumerate(args.matrices):
        sparsity_local = [None] * args.threads
        hic_ma = hm.hiCMatrix(matrix)
        viewpointObj.hicMatrix = hic_ma

        all_data_collected = False
        thread_done = [False] * args.threads
        for i in range(args.threads):

            if i < args.threads - 1:
                referencePointsThread = referencePoints[
                    i * referencePointsPerThread:(i + 1) *
                    referencePointsPerThread]
            else:
                referencePointsThread = referencePoints[
                    i * referencePointsPerThread:]
            if len(referencePointsThread) == 0:
                process[i] = None
                queue[i] = None
                sparsity_local[i] = []
                continue
            else:
                queue[i] = Queue()
                process[i] = Process(
                    target=compute_sparsity,
                    kwargs=dict(pReferencePoints=referencePointsThread,
                                pViewpointObj=viewpointObj,
                                pArgs=args,
                                pQueue=queue[i]))

                process[i].start()
                log.debug('process started {}'.format(i))

        while not all_data_collected:
            for i in range(args.threads):
                if queue[i] is not None and not queue[i].empty():
                    sparsity_ = queue[i].get()
                    if 'Fail:' in sparsity_:
                        fail_flag = True
                        fail_message = sparsity_[6:]
                    log.debug('process computed: {}'.format(i))
                    sparsity_local[i] = sparsity_
                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True
            all_data_collected = True
            for thread in thread_done:
                if not thread:
                    all_data_collected = False
            time.sleep(1)

        del hic_ma
        del viewpointObj.hicMatrix

        # merge sparsity data per matrix from each thread to one list
        if fail_flag:
            log.error(fail_message)
            exit(1)
        sparsity_local = [
            item for sublist in sparsity_local for item in sublist
        ]
        sparsity.append(sparsity_local)

    # sparsity = np.array(sparsity)
    # mask = sparsity == -1.0

    # transpose so that the sparsity values are organized per viewpoint: viewpoint = [matrix1, ..., matrix_n]
    sparsity = np.array(sparsity).T
    count_accepted = 0
    count_rejected = 0
    count_failure = 0
    with open(args.referencePoints, 'r') as reference_file_input:
        with open(args.outFileName + '_raw_filter', 'w') as output_file_raw:
            output_file_raw.write(
                '# Created with chicQualityControl version {}\n'.format(
                    __version__))
            output_file_raw.write(
                '# A sparsity of -1.0 indicates a faulty reference point, e.g. no data for this reference point was present in the matrix.\n'
            )
            output_file_raw.write('# Used Matrices ')
            for matrix in args.matrices:
                output_file_raw.write('{}\t'.format(matrix))
            output_file_raw.write('\n# Chromosome\tStart\tEnd')
            for matrix in args.matrices:
                output_file_raw.write('\tSparsity {}'.format(
                    os.path.basename(matrix)))
            output_file_raw.write('\n')

            with open(args.outFileName + '_failed_reference_points',
                      'w') as output_file_failed:
                with open(args.outFileName + '_rejected_filter',
                          'w') as output_file_rejected:
                    with open(args.outFileName, 'w') as output_file:
                        for i, line in enumerate(
                                reference_file_input.readlines()):
                            sparsity_str = '\t'.join(
                                str(x) for x in sparsity[i])
                            output_file_raw.write(line.strip() + '\t' +
                                                  sparsity_str + '\n')
                            count = 0
                            count_negative = 0
                            for j in range(len(sparsity[i])):
                                if sparsity[i][j] == -1.0:
                                    count_negative += 1
                                elif sparsity[i][j] > args.sparsity:
                                    count += 1
                            if count_negative:
                                output_file_failed.write(line)
                                count_failure += 1
                            elif count:
                                output_file.write(line)
                                count_accepted += 1
                            else:
                                output_file_rejected.write(line)
                                count_rejected += 1

    with open(args.outFileName + '_report', 'w') as output_file_report:
        output_file_report.write(
            '# Created with chicQualityControl version {}\n'.format(
                __version__))
        output_file_report.write('# QC report for matrices: ')
        for matrix in args.matrices:
            output_file_report.write(matrix + ' ')
        output_file_report.write('\n')
        output_file_report.write(
            '# Sparsity threshold for rejection: reference points with a sparsity <= {} in all matrices are rejected.\n'
            .format(args.sparsity))
        output_file_report.write('\nNumber of reference points: {}\n'.format(
            str(count_accepted + count_rejected + count_failure)))
        output_file_report.write(
            'Number of accepted reference points: {}\n'.format(
                str(count_accepted)))
        output_file_report.write(
            'Number of rejected reference points: {}\n'.format(
                str(count_rejected)))
        output_file_report.write(
            'Number of faulty reference points: {}\n'.format(
                str(count_failure)))
        output_file_report.write(
            '\n\nA faulty reference point is caused by the absence of the chromosome in one of the given matrices.\n'
        )
        output_file_report.write(
            'It can also be caused by the absence of valid Hi-C reads in a region, especially at the chromosome ends.\n'
        )
        output_file_report.write(
            'Please check the results of hicInfo to validate this for your data.\n'
        )

    # output plot of sparsity distribution per sample
    # remove faulty reference points from the statistics
    x = [[]] * len(args.matrices)
    y = [[]] * len(args.matrices)

    mask = [True] * len(sparsity)
    for i in range(len(sparsity)):
        delete_instance = False
        for j in range(len(args.matrices)):
            if sparsity[i][j] == -1.0:
                delete_instance = True
        if delete_instance:
            mask[i] = False

    mask = np.array(mask)
    sparsity = sparsity[mask]

    for i in range(len(args.matrices)):
        y[i] = [i] * len(sparsity)

    sparsity = sparsity.T

    for i in range(len(args.matrices)):
        x[i] = sparsity[i].flatten()

    for i in range(len(args.matrices)):
        plt.plot(x[i],
                 y[i],
                 'o',
                 mfc='none',
                 markersize=0.3,
                 label=args.matrices[i].split('/')[-1])
    plt.yticks([])
    plt.xlabel("Sparsity level")

    plt.axvline(x=args.sparsity,
                c='r',
                label='sparsity threshold',
                linewidth=0.3)
    plt.xscale('log')
    ax = plt.gca()
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.6, box.height])
    plt.legend(loc='center', bbox_to_anchor=(1.4, 0.5))
    plt.savefig(args.outFileNameSparsity, dpi=args.dpi)

    # plt.xlabel("Length of list (number)")
    # plt.ylabel("Time taken (seconds)")
    plt.close()
    for i in range(len(args.matrices)):
        plt.hist(x[i],
                 bins=100,
                 alpha=0.5,
                 label=args.matrices[i].split('/')[-1])
    plt.xlabel("Sparsity level")
    plt.ylabel("Number of counts")
    # plt.legend(loc='upper right')

    ax = plt.gca()
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.6, box.height])
    plt.legend(loc='center', bbox_to_anchor=(1.4, 0.5))
    plt.savefig(args.outFileNameHistogram, dpi=args.dpi)
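
# Illustrative sketch (not part of HiCExplorer): the process/queue fan-out pattern
# used by the main() above, reduced to a toy worker. The items are split into one
# chunk per process, each worker puts its partial result on a Queue, and the
# parent polls the queues until every worker has delivered. toy_worker and
# fan_out_collect are made-up names for this sketch.
import time
from multiprocessing import Process, Queue


def toy_worker(pItems, pQueue):
    # stand-in for compute_sparsity: one value per input item
    pQueue.put([len(str(item)) for item in pItems])


def fan_out_collect(items, threads=4):
    per_thread = len(items) // threads
    queue = [None] * threads
    process = [None] * threads
    done = [False] * threads
    results = [[] for _ in range(threads)]

    for i in range(threads):
        if i < threads - 1:
            chunk = items[i * per_thread:(i + 1) * per_thread]
        else:
            chunk = items[i * per_thread:]
        if len(chunk) == 0:
            done[i] = True
            continue
        queue[i] = Queue()
        process[i] = Process(target=toy_worker,
                             kwargs=dict(pItems=chunk, pQueue=queue[i]))
        process[i].start()

    while not all(done):
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                results[i] = queue[i].get()
                process[i].join()
                queue[i] = None
                done[i] = True
        time.sleep(0.1)

    # merge the per-process partial results into one flat list
    return [value for sublist in results for value in sublist]


if __name__ == '__main__':
    print(fan_out_collect(list(range(10)), threads=3))
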
def main(args=None):
    args = parse_arguments().parse_args(args)

    viewpointObj = Viewpoint()
    referencePoints, _ = viewpointObj.readReferencePointFile(
        args.referencePoints)

    # compute the sparsity for each viewpoint; viewpoints with a sparsity below the given threshold are considered bad.

    referencePointsPerThread = len(referencePoints) // args.threads
    queue = [None] * args.threads
    process = [None] * args.threads
    sparsity = []

    for j, matrix in enumerate(args.matrices):
        sparsity_local = [None] * args.threads
        hic_ma = hm.hiCMatrix(matrix)
        viewpointObj.hicMatrix = hic_ma

        all_data_collected = False
        thread_done = [False] * args.threads
        for i in range(args.threads):

            if i < args.threads - 1:
                referencePointsThread = referencePoints[
                    i * referencePointsPerThread:(i + 1) *
                    referencePointsPerThread]
            else:
                referencePointsThread = referencePoints[
                    i * referencePointsPerThread:]
            if len(referencePointsThread) == 0:
                process[i] = None
                queue[i] = None
                sparsity_local[i] = []
                continue
            else:
                queue[i] = Queue()
                process[i] = Process(
                    target=compute_sparsity,
                    kwargs=dict(pReferencePoints=referencePointsThread,
                                pViewpointObj=viewpointObj,
                                pArgs=args,
                                pQueue=queue[i]))

                process[i].start()

        while not all_data_collected:
            for i in range(args.threads):
                if queue[i] is not None and not queue[i].empty():
                    sparsity_ = queue[i].get()
                    sparsity_local[i] = sparsity_
                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True
            all_data_collected = True
            for thread in thread_done:
                if not thread:
                    all_data_collected = False
            time.sleep(1)

        del hic_ma
        del viewpointObj.hicMatrix

        # merge sparsity data per matrix from each thread to one list

        sparsity_local = [
            item for sublist in sparsity_local for item in sublist
        ]
        sparsity.append(sparsity_local)

    # transpose so that the sparsity values are organized per viewpoint: viewpoint = [matrix1, ..., matrix_n]
    sparsity = np.array(sparsity).T

    with open(args.referencePoints, 'r') as reference_file_input:
        with open(args.outFileName + '_raw_filter', 'w') as output_file_raw:
            output_file_raw.write(
                '# Created with chicQualityControl version {}\n'.format(
                    __version__))
            output_file_raw.write('# Chromosome\tStart\tEnd\t')
            for matrix in args.matrices:
                output_file_raw.write('Sparsity {}\t'.format(matrix))
            output_file_raw.write('\n')

            with open(args.outFileName + '_rejected_filter',
                      'w') as output_file_rejected:
                with open(args.outFileName, 'w') as output_file:
                    for i, line in enumerate(reference_file_input.readlines()):
                        sparsity_str = '\t'.join(str(x) for x in sparsity[i])
                        output_file_raw.write(line.strip() + '\t' +
                                              sparsity_str + '\n')
                        count = 0
                        for j in range(len(sparsity[i])):
                            if sparsity[i][j] > args.sparsity:
                                count += 1
                        if count:
                            output_file.write(line)
                        else:
                            output_file_rejected.write(line)
    # output plot of sparsity distribution per sample

    # re-arrange the values for plotting

    x = [[]] * len(args.matrices)
    y = [[]] * len(args.matrices)

    for i in range(len(args.matrices)):
        y[i] = [i] * len(sparsity)
    sparsity = sparsity.T

    for i in range(len(args.matrices)):
        x[i] = sparsity[i].flatten()

    for i in range(len(args.matrices)):
        plt.plot(x[i],
                 y[i],
                 'o',
                 mfc='none',
                 markersize=0.3,
                 label=args.matrices[i].split('/')[-1])
    plt.yticks([])
    plt.xlabel("Sparsity level")

    plt.axvline(x=args.sparsity,
                c='r',
                label='sparsity threshold',
                linewidth=0.3)
    plt.xscale('log')
    ax = plt.gca()
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.6, box.height])
    plt.legend(loc='center', bbox_to_anchor=(1.4, 0.5))
    plt.savefig(args.outFileNameSparsity, dpi=args.dpi)

    # plt.xlabel("Length of list (number)")
    # plt.ylabel("Time taken (seconds)")
    plt.close()
    for i in range(len(args.matrices)):
        plt.hist(x[i],
                 bins=100,
                 alpha=0.5,
                 label=args.matrices[i].split('/')[-1])
    plt.xlabel("Sparsity level")
    plt.ylabel("Number of counts")
    # plt.legend(loc='upper right')

    ax = plt.gca()
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.6, box.height])
    plt.legend(loc='center', bbox_to_anchor=(1.4, 0.5))
    plt.savefig(args.outFileNameHistogram, dpi=args.dpi)
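
# Illustrative sketch (not part of HiCExplorer): how the per-matrix sparsity lists
# are turned into per-viewpoint rows, and how viewpoints that are faulty in any
# matrix (marked with -1.0) can be dropped before plotting, as in the first
# version of main() above. The numbers are made up.
import numpy as np

sparsity = np.array([[0.10, 0.80, -1.0, 0.40],    # matrix 1, four viewpoints
                     [0.12, 0.75, 0.30, 0.35]])   # matrix 2, four viewpoints

per_viewpoint = sparsity.T                        # shape: (viewpoints, matrices)
keep = ~np.any(per_viewpoint == -1.0, axis=1)     # drop viewpoints with any -1.0
filtered = per_viewpoint[keep]
print(filtered)      # viewpoint 3 removed
print(filtered.T)    # back to one row per matrix, ready for plotting
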
Example #21
def main(args=None):
    args = parse_arguments().parse_args(args)

    viewpointObj = Viewpoint()

    referencePoints, gene_list = viewpointObj.readReferencePointFile(
        args.referencePoints)
    referencePointsPerThread = len(referencePoints) // args.threads
    queue = [None] * args.threads
    process = [None] * args.threads
    file_list = []
    background_model = viewpointObj.readBackgroundDataFile(
        args.backgroundModelFile, args.range, args.fixateRange)
    background_model_mean_values = viewpointObj.readBackgroundDataFile(
        args.backgroundModelFile, args.range, args.fixateRange, pMean=True)
    # background_sum_of_densities_dict = viewpointObj.computeSumOfDensities(
    #     background_model, args, pXfoldMaxValue=args.xFoldMaxValueNB)

    if not os.path.exists(args.outputFolder):
        try:
            os.makedirs(args.outputFolder)
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    fail_flag = False
    fail_message = ''

    for matrix in args.matrices:
        hic_ma = hm.hiCMatrix(matrix)
        viewpointObj.hicMatrix = hic_ma
        file_list_sample = [None] * args.threads
        all_data_collected = False

        for i in range(args.threads):

            if i < args.threads - 1:
                referencePointsThread = referencePoints[i * referencePointsPerThread:(i + 1) * referencePointsPerThread]
                geneListThread = gene_list[i * referencePointsPerThread:(i + 1) * referencePointsPerThread]
            else:
                referencePointsThread = referencePoints[i * referencePointsPerThread:]
                geneListThread = gene_list[i * referencePointsPerThread:]

            if len(referencePointsThread) == 0:
                process[i] = None
                queue[i] = None
                file_list_sample[i] = []
                continue
            queue[i] = Queue()
            process[i] = Process(target=compute_viewpoint, kwargs=dict(
                pViewpointObj=viewpointObj,
                pArgs=args,
                pQueue=queue[i],
                pReferencePoints=referencePointsThread,
                pGeneList=geneListThread,
                pMatrix=matrix,
                pBackgroundModel=background_model,
                pBackgroundModelRelativeInteractions=background_model_mean_values,
                pOutputFolder=args.outputFolder
            )
            )

            process[i].start()

        while not all_data_collected:
            for i in range(args.threads):
                if queue[i] is not None and not queue[i].empty():
                    file_list_ = queue[i].get()
                    if 'Fail:' in file_list_:
                        fail_flag = True
                        fail_message = file_list_[6:]
                    file_list_sample[i] = file_list_
                    process[i].join()
                    process[i].terminate()
                    process[i] = None

            all_data_collected = True

            for i in range(args.threads):
                if process[i] is not None:
                    all_data_collected = False
            time.sleep(1)

        if fail_flag:
            log.error(fail_message)
            exit(1)

        file_list_sample = [item for sublist in file_list_sample for item in sublist]
        file_list.append(file_list_sample)

    log.debug('file_list {}'.format(file_list))
    if args.writeFileNamesToFile:
        with open(args.writeFileNamesToFile, 'w') as file:
            log.debug('len(file_list) {}'.format(len(file_list)))
            if len(file_list) > 1:
                for i, sample in enumerate(file_list):
                    for sample2 in file_list[i + 1:]:
                        for viewpoint, viewpoint2 in zip(sample, sample2):
                            file.write(viewpoint + '\n')
                            file.write(viewpoint2 + '\n')
            else:
                for viewpoint in file_list[0]:
                    file.write(viewpoint + '\n')
    if args.allViewpointsList:

        with open(args.writeFileNamesToFile + 'all', 'w') as file:
            if len(file_list) > 1:
                for i, sample in enumerate(file_list[0]):
                    file.write(sample + '\n')
                    for j in range(1, len(file_list)):
                        file.write(file_list[j][i] + '\n')
            else:
                for viewpoint in file_list[0]:
                    file.write(viewpoint + '\n')
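
# Illustrative sketch (not part of HiCExplorer): the errno-based guard used in the
# main() above protects os.makedirs against the race where another process creates
# the folder between the existence check and the call; on Python >= 3.2 the
# exist_ok flag expresses the same intent. ensure_folder is a made-up name.
import errno
import os


def ensure_folder(path):
    try:
        os.makedirs(path)
    except OSError as exc:  # guard against race condition
        if exc.errno != errno.EEXIST:
            raise


# equivalent on modern Python:
# os.makedirs(path, exist_ok=True)
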
Example #22
def main(args=None):

    args = parse_arguments().parse_args(args)

    if args.chromosomes is not None and args.regions is not None:
        log.error('Please specify either --chromosomes or --regions.')
        exit(1)
    hic_ma = None
    if args.chromosomes:

        if check_cooler(args.matrix) and len(
                args.chromosomes) == 1 and args.action == 'keep':
            hic_ma = hm.hiCMatrix(args.matrix, pChrnameList=args.chromosomes)
        else:
            hic_ma = hm.hiCMatrix(args.matrix)

        if args.action == 'keep':
            hic_ma.reorderChromosomes(args.chromosomes)
        elif args.action == 'remove':
            chromosomes = list(hic_ma.chrBinBoundaries)
            for chromosome in args.chromosomes:
                if chromosome in chromosomes:
                    chromosomes.remove(chromosome)
            hic_ma.reorderChromosomes(chromosomes)
        elif args.action == 'mask':
            hic_ma.maskChromosomes(args.chromosomes)
    elif args.regions:
        hic_ma = hm.hiCMatrix(args.matrix)
        genomic_regions = []
        with open(args.regions, 'r') as file:
            for line in file.readlines():
                _line = line.strip().split('\t')
                if len(_line) < 3:
                    continue
                chrom, start, end = _line[0], int(_line[1]), int(_line[2]) - 1
                genomic_regions.append((chrom, start, end))

        # log.debug('genomic_regions {}'.format(genomic_regions))
        matrix_indices_regions = []
        for region in genomic_regions:
            _regionBinRange = hic_ma.getRegionBinRange(region[0], region[1],
                                                       region[2])
            if _regionBinRange is not None:
                start, end = _regionBinRange
                matrix_indices_regions.extend(list(range(start, end)))

        # log.debug('matrix_indices_regions {}'.format(matrix_indices_regions))
        if args.action == 'keep':
            hic_ma.reorderBins(matrix_indices_regions)
        elif args.action == 'mask':
            hic_ma.maskBins(matrix_indices_regions)

        elif args.action == 'remove':

            full_matrix_range = np.array(
                range(0, max(hic_ma.matrix.shape[0], hic_ma.matrix.shape[1])))
            matrix_indices_regions = np.array(matrix_indices_regions)
            full_matrix_range[matrix_indices_regions] = -1
            mask = full_matrix_range != -1
            full_matrix_range = full_matrix_range[mask]

            hic_ma.reorderBins(full_matrix_range)
    elif args.maskBadRegions:
        # args.chromosomes is not set in this branch, therefore load the full matrix
        hic_ma = hm.hiCMatrix(args.matrix)

    else:
        log.info(
            'No data to adjust given. Please specify either the --chromosomes or the --regions parameter.'
        )

    if hic_ma is not None:
        hic_ma.save(args.outFileName)
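
# Illustrative sketch (not part of HiCExplorer): the 'remove' branch above keeps
# every bin index that is not listed in matrix_indices_regions. The -1 marker
# trick from the code and np.setdiff1d give the same result; the indices below
# are made up.
import numpy as np

n_bins = 10
matrix_indices_regions = np.array([2, 3, 7])

full_matrix_range = np.arange(n_bins)
full_matrix_range[matrix_indices_regions] = -1
kept_marker_trick = full_matrix_range[full_matrix_range != -1]

kept_setdiff = np.setdiff1d(np.arange(n_bins), matrix_indices_regions)

assert np.array_equal(kept_marker_trick, kept_setdiff)
print(kept_setdiff)  # [0 1 4 5 6 8 9]
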
Example #23
def main(args=None):

    args = parse_arguments().parse_args(args)
    for matrix in args.matrices:
        # if
        generated_by = None
        genome_assembly = None
        statistics = None
        generated_by_cooler_lib = None
        tool_url = None
        matrix_generated_by = None
        matrix_generated_by_url = None
        creation_date = None
        chromosomes = None
        bin_length = None
        size = None
        nchroms = None
        num_non_zero = None
        min_non_zero = None
        max_non_zero = None
        sum_elements = None
        num_nan_bins = None

        if check_cooler(matrix) and args.no_metadata:
            cooler_file = cooler.Cooler(matrix)

            if cooler_file.info is not None:
                # log.debug('cooler_file.info {}'.format(cooler_file.info))
                if 'bin-size' in cooler_file.info:
                    bin_length = cooler_file.info['bin-size']
                if 'nbins' in cooler_file.info:
                    size = cooler_file.info['nbins']
                if 'nchroms' in cooler_file.info:
                    nchroms = cooler_file.info['nchroms']
                if 'chromosomes' in cooler_file.info:
                    chromosomes = cooler_file.info['chromosomes']
                if 'nnz' in cooler_file.info:
                    num_non_zero = cooler_file.info['nnz']
                if 'min-value' in cooler_file.info:
                    min_non_zero = cooler_file.info['min-value']
                if 'max-value' in cooler_file.info:
                    max_non_zero = cooler_file.info['max-value']
                if 'generated-by' in cooler_file.info:
                    generated_by = toString(cooler_file.info['generated-by'])
                if 'genome-assembly' in cooler_file.info:
                    genome_assembly = toString(
                        cooler_file.info['genome-assembly'])
                if 'metadata' in cooler_file.info:
                    if cooler_file.info['metadata'] is not None:
                        if 'statistics' in cooler_file.info['metadata']:
                            statistics = cooler_file.info['metadata'][
                                'statistics']
                if 'generated-by-cooler-lib' in cooler_file.info:
                    generated_by_cooler_lib = toString(
                        cooler_file.info['generated-by-cooler-lib'])
                if 'tool-url' in cooler_file.info:
                    tool_url = toString(cooler_file.info['tool-url'])
                if 'matrix-generated-by' in cooler_file.info:
                    matrix_generated_by = toString(
                        cooler_file.info['matrix-generated-by'])
                if 'matrix-generated-by-url' in cooler_file.info:
                    matrix_generated_by_url = toString(
                        cooler_file.info['matrix-generated-by-url'])
                if 'creation-date' in cooler_file.info:
                    creation_date = cooler_file.info['creation-date']
                if 'sum-elements' in cooler_file.info:
                    sum_elements = cooler_file.info['sum-elements']

        else:
            hic_ma = hm.hiCMatrix(matrix)
            size = hic_ma.matrix.shape[0]
            num_non_zero = hic_ma.matrix.nnz
            sum_elements = hic_ma.matrix.sum() / 2
            bin_length = hic_ma.getBinSize()
            num_nan_bins = len(hic_ma.nan_bins)
            min_non_zero = hic_ma.matrix.data.min()
            max_non_zero = hic_ma.matrix.data.max()

            chromosomes = list(hic_ma.chrBinBoundaries)

        information = StringIO()
        information.write(
            "# Matrix information file. Created with HiCExplorer's hicInfo version {}\n"
            .format(__version__))

        if matrix is not None:
            information.write("File:\t{}\n".format(matrix))
        if creation_date is not None:
            information.write("Date:\t{}\n".format(creation_date))

        if genome_assembly is not None:
            information.write("Genome assembly:\t{}\n".format(genome_assembly))
        if size is not None:
            information.write("Size:\t{:,}\n".format(size))
        if bin_length is not None:
            information.write("Bin_length:\t{}\n".format(bin_length))
        if sum_elements is not None:
            information.write("Sum of matrix:\t{}\n".format(sum_elements))
        if chromosomes is not None:
            information.write("Chromosomes:\t{}\n".format(", ".join(
                toString(chromosomes))))
        if nchroms is not None:
            information.write("Number of chromosomes:\t{}\n".format(nchroms))
        if num_non_zero is not None:
            information.write(
                "Non-zero elements:\t{:,}\n".format(num_non_zero))
        if min_non_zero is not None:
            information.write("Minimum (non zero):\t{}\n".format(min_non_zero))
        if max_non_zero is not None:
            information.write("Maximum:\t{}\n".format(max_non_zero))
        if num_nan_bins is not None:
            information.write("NaN bins:\t{}\n".format(num_nan_bins))

        if check_cooler(matrix):
            information.write(
                'The following columns are available: {}\n'.format(
                    cooler.Cooler(matrix).bins().columns.values))
        if generated_by is not None:
            information.write("\n\nGenerated by:\t{}\n".format(generated_by))

        if generated_by_cooler_lib is not None:
            information.write("Cooler library version:\t{}\n".format(
                generated_by_cooler_lib))
        if tool_url is not None:
            information.write("HiCMatrix url:\t{}\n".format(tool_url))
        if matrix_generated_by is not None:
            information.write("Interaction matrix created with:\t{}\n".format(
                matrix_generated_by))
        if matrix_generated_by_url is not None:
            information.write("URL:\t{}\n".format(matrix_generated_by_url))

        if statistics is not None:
            information.write("\n\nBuild statistics:\n{}\n".format(statistics))

        if args.outFileName:
            with open(args.outFileName, 'w') as file:
                file.write(information.getvalue())
        else:
            print(information.getvalue())

        information.close()
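
# Illustrative sketch (not part of HiCExplorer): the long chain of
# "if key in cooler_file.info" checks above can be written with dict.get, which
# returns None for missing keys. This assumes cooler_file.info behaves like a
# plain dict, as the membership tests above suggest; 'example.cool' is a
# placeholder path for an existing cooler file.
import cooler

cooler_file = cooler.Cooler('example.cool')
info = cooler_file.info or {}

bin_length = info.get('bin-size')
size = info.get('nbins')
nchroms = info.get('nchroms')
num_non_zero = info.get('nnz')
statistics = (info.get('metadata') or {}).get('statistics')

print(bin_length, size, nchroms, num_non_zero, statistics)
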
def main(args=None):
    """
    for each distance, compare the
    distribution of two samples,
    report number of cases were they differ
    """

    args = parse_arguments().parse_args(args)
    mean_dict = OrderedDict()
    matrix_sum = {}
    if args.labels is None:
        labels = OrderedDict([(x, os.path.basename(x)) for x in args.matrices])
    else:
        labels = OrderedDict(zip(args.matrices, args.labels))

    chroms = set()
    for matrix_file in args.matrices:
        hic_ma = HiCMatrix.hiCMatrix(matrix_file)
        matrix_sum[matrix_file] = hic_ma.matrix.sum()
        if args.chromosomeExclude is None:
            args.chromosomeExclude = []

        chrtokeep = [x for x in list(hic_ma.interval_trees) if x not in args.chromosomeExclude]
        hic_ma.keepOnlyTheseChr(chrtokeep)

        mean_dict[matrix_file] = compute_distance_mean(hic_ma, maxdepth=args.maxdepth, perchr=args.perchr)
        chroms = chroms.union([k for k in list(mean_dict[matrix_file]) if len(mean_dict[matrix_file][k]) > 1])

    # compute scale factors such that values are comparable
    min_sum = min(matrix_sum.values())
    scale_factor = dict([(matrix_file, float(min_sum) / mat_sum) for matrix_file, mat_sum in matrix_sum.items()])
    log.info("The scale factors used are: {}".format(scale_factor))
    if len(args.matrices) > 1 and args.perchr:
        # in this case, for each chromosome a plot is made that combines the data from the
        # hic matrices
        max_cols = 4
        num_rows = int(np.ceil(float(len(chroms)) / max_cols))
        num_cols = min(len(chroms), max_cols)

    else:
        num_cols = num_rows = 1

    if args.plotsize is None:
        width = 6
        height = 4
    else:
        width, height = args.plotsize
    fig = plt.figure(figsize=(width * num_cols, height * num_rows))

    axs = np.empty((num_rows, num_cols), dtype='object')
    export_tables = []
    for matrix_file in args.matrices:
        idx = 0
        for chrom, mean_values in mean_dict[matrix_file].items():
            if len(mean_values) <= 1:
                log.debug("No values found for: {}, chromosome: {}\n".format(matrix_file, chrom))
                continue
            x, y = zip(*[(k, v) for k, v in mean_values.items() if v > 0])
            if len(x) <= 1:
                log.debug("No values found for: {}, chromosome: {}\n".format(matrix_file, chrom))
                continue
            if args.perchr and len(args.matrices) == 1:
                col = 0
                row = 0
            else:
                col = idx % num_cols
                row = idx // num_cols
            if axs[row, col] is None:
                ax = plt.subplot2grid((num_rows, num_cols), (row, col))
                ax.set_xlabel('genomic distance')
                ax.set_ylabel('corrected Hi-C counts')
                try:
                    ax.set_yscale('log')
                    ax.set_xscale('log')
                except ValueError:
                    continue
            else:
                ax = axs[row, col]
            y = np.array(y) * scale_factor[matrix_file]
            if args.perchr and len(args.matrices) > 1:
                label = labels[matrix_file]
                ax.set_title(chrom)
            elif args.perchr:
                label = chrom
            else:
                label = labels[matrix_file]

            ax.plot(x, y, label=label)
            axs[row, col] = ax
            idx += 1
            if args.outFileData is not None:
                table_to_export = pd.DataFrame({'Matrix': labels[matrix_file],
                                                'Chromosome': chrom,
                                                'Distance': np.asarray(x),
                                                'Contacts': np.asarray(y)})
                export_tables.append(table_to_export)

    for ax in axs.reshape(-1):
        if ax is None:
            continue
        ax.legend(prop={'size': 'small'})
        ax.set_xlim(0, args.maxdepth)
        handles, labels = ax.get_legend_handles_labels()
        lgd = ax.legend(handles, labels, loc='center left', bbox_to_anchor=(1, 0.5))

    if args.outFileData is not None and export_tables:
        # write the collected data once instead of overwriting the file per chromosome
        pd.concat(export_tables).to_csv(args.outFileData, sep='\t')

    plt.tight_layout()
    plt.savefig(args.plotFile.name, bbox_inches='tight', bbox_extra_artists=(lgd,))
    plt.close(fig)
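
# Illustrative sketch (not part of HiCExplorer): the scale factors computed in the
# main() above rescale every matrix to the total contact count of the smallest
# one, so the distance-vs-counts curves become comparable. File names and totals
# below are made up.
matrix_sum = {'sample_a.h5': 2.0e6, 'sample_b.h5': 1.0e6, 'sample_c.h5': 4.0e6}

min_sum = min(matrix_sum.values())
scale_factor = dict([(name, float(min_sum) / total)
                     for name, total in matrix_sum.items()])
print(scale_factor)  # {'sample_a.h5': 0.5, 'sample_b.h5': 1.0, 'sample_c.h5': 0.25}
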
Example #25
def plotMatrix(matrixinputfile,imageoutputfile, regionindex1, regionindex2, comparematrix, title, bigwig):
        if not checkExtension(matrixinputfile, '.cool'):
            msg = "input matrix must be in cooler format (.cool)"
            raise SystemExit(msg)
        if comparematrix and not checkExtension(comparematrix, ".cool"):
            msg = "if specified, compare matrix must be in cooler format (.cool)"
            raise SystemExit(msg)
        if not imageoutputfile:
            imageoutputfile = os.path.splitext(matrixinputfile)[0] + '.png'
        elif imageoutputfile and not checkExtension(imageoutputfile, ".png"):
            imageoutputfile = os.path.splitext(imageoutputfile)[0] + ".png"
       
        #get the full matrix first to extract the desired region
        ma = hm.hiCMatrix(matrixinputfile)
        cuts = ma.cut_intervals
        chromosome = cuts[0][0]
        maxIndex = len(cuts) - 1
        #check indices and get the region if ok
        if regionindex1 > maxIndex:
            msg = "invalid start region. Allowed is 0 to {0:d} (0 to {1:d})".format(maxIndex, cuts[maxIndex][1])
            raise SystemExit(msg)
        if regionindex2 < regionindex1:
           msg = "region index 2 must be smaller than region index 1"
           raise SystemExit(msg)
        if regionindex2 > maxIndex:
            regionindex2 = maxIndex
            print("region index 2 clamped to max. value {0:d}".format(maxIndex))
        region = str(chromosome) +":"+str(cuts[regionindex1][1])+"-"+ str(cuts[regionindex2][1])
        
        #now get the data for the input matrix, restricted to the desired region
        upperHiCMatrix = hm.hiCMatrix(matrixinputfile ,pChrnameList=[region])
        upperMatrix = triu(upperHiCMatrix.matrix, k=1, format="csr")
        
        #if set, get data from the same region also for the compare matrix
        #there's no compatibility check so far
        lowerHiCMatrix = None
        lowerMatrix = None
        if comparematrix:
            lowerHiCMatrix = hm.hiCMatrix(comparematrix)
            if chromosome not in [row[0] for row in lowerHiCMatrix.cut_intervals]:
                msg = "compare matrix must contain the same chromosome as the input matrix"
                raise SystemExit(msg)
            lowerHiCMatrix = hm.hiCMatrix(comparematrix , pChrnameList=[region])
            lowerMatrix = tril(lowerHiCMatrix.matrix, k=0, format="csr") 

            if lowerMatrix.get_shape() != upperMatrix.get_shape():
                msg = "shapes of input matrix and compare matrix do not match. Check resolutions"
                raise SystemExit(msg)

        #arguments for plotting
        plotArgs = Namespace(bigwig=bigwig, 
                             chromosomeOrder=None, 
                             clearMaskedBins=False, 
                             colorMap='RdYlBu_r', 
                             disable_tight_layout=False, 
                             dpi=300, 
                             flipBigwigSign=False, 
                             log=False, log1p=True, 
                             perChromosome=False, 
                             region=region, 
                             region2=None, 
                             scaleFactorBigwig=1.0, 
                             scoreName=None, 
                             title=title, 
                             vMax=None, vMaxBigwig=None, 
                             vMin=1.0, vMinBigwig=None,
                             matrix=matrixinputfile)
        
        #following code is largely duplicated from hicPlotMatrix
        #not exactly beautiful, but works for now
        chrom, region_start, region_end, idx1, start_pos1, chrom2, region_start2, region_end2, idx2, start_pos2 = hicPlot.getRegion(plotArgs, upperHiCMatrix)
        

        mixedMatrix = None
        if comparematrix:
            mixedMatrix = np.asarray((lowerMatrix + upperMatrix).todense().astype(float))
        else:
            mixedMatrix = np.asarray(upperHiCMatrix.matrix.todense().astype(float))
        
        #colormap for plotting
        cmap = cm.get_cmap(plotArgs.colorMap) # pylint: disable=no-member
        cmap.set_bad('black')
        
        bigwig_info = None
        if plotArgs.bigwig: # pylint: disable=no-member
            bigwig_info = {'args': plotArgs, 'axis': None, 'axis_colorbar': None, 'nan_bins': upperHiCMatrix.nan_bins}
        norm = None

        if plotArgs.log or plotArgs.log1p: # pylint: disable=no-member
            mask = mixedMatrix == 0
            try:
                mixedMatrix[mask] = np.nanmin(mixedMatrix[mask == False])
            except ValueError:
                log.info('Matrix contains only 0. Set all values to {}'.format(np.finfo(float).tiny))
                mixedMatrix[mask] = np.finfo(float).tiny
            if np.isnan(mixedMatrix).any() or np.isinf(mixedMatrix).any():
                log.debug("any nan {}".format(np.isnan(mixedMatrix).any()))
                log.debug("any inf {}".format(np.isinf(mixedMatrix).any()))
                mask_nan = np.isnan(mixedMatrix)
                mask_inf = np.isinf(mixedMatrix)
                mixedMatrix[mask_nan] = np.nanmin(mixedMatrix[mask_nan == False])
                mixedMatrix[mask_inf] = np.nanmin(mixedMatrix[mask_inf == False])

        log.debug("any nan after remove of nan: {}".format(np.isnan(mixedMatrix).any()))
        log.debug("any inf after remove of inf: {}".format(np.isinf(mixedMatrix).any()))
        if plotArgs.log1p: # pylint: disable=no-member
            mixedMatrix += 1
            norm = LogNorm()
        elif plotArgs.log: # pylint: disable=no-member 
            norm = LogNorm()

        if plotArgs.bigwig: # pylint: disable=no-member
            # increase figure height to accommodate bigwig track
            fig_height = 8.5
        else:
            fig_height = 7
        height = 4.8 / fig_height
        
        fig_width = 8
        width = 5.0 / fig_width
        left_margin = (1.0 - width) * 0.5

        fig = plt.figure(figsize=(fig_width, fig_height), dpi=plotArgs.dpi) # pylint: disable=no-member

        if plotArgs.bigwig: # pylint: disable=no-member
            gs = gridspec.GridSpec(2, 2, height_ratios=[0.90, 0.1], width_ratios=[0.97, 0.03])
            gs.update(hspace=0.05, wspace=0.05)
            ax1 = plt.subplot(gs[0, 0])
            ax2 = plt.subplot(gs[1, 0])
            ax3 = plt.subplot(gs[0, 1])
            bigwig_info['axis'] = ax2
            bigwig_info['axis_colorbar'] = ax3
        else:
            ax1 = None
        
        bottom = 1.3 / fig_height

        position = [left_margin, bottom, width, height]
        hicPlot.plotHeatmap(mixedMatrix, ma.get_chromosome_sizes(), fig, position,
                    plotArgs, cmap, xlabel=chrom, ylabel=chrom2,
                    start_pos=start_pos1, start_pos2=start_pos2, pNorm=norm, pAxis=ax1, pBigwig=bigwig_info)
        plt.savefig(imageoutputfile, dpi=plotArgs.dpi) # pylint: disable=no-member
        plt.close(fig)
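
# Illustrative sketch (not part of the example above): merging two square matrices
# into one heatmap, the first strictly above and the second on and below the main
# diagonal, with scipy's triu/tril as used by plotMatrix. The toy matrices are
# made up.
import numpy as np
from scipy.sparse import csr_matrix, tril, triu

a = csr_matrix(np.arange(1, 10, dtype=float).reshape(3, 3))
b = csr_matrix(np.arange(10, 19, dtype=float).reshape(3, 3))

upper = triu(a, k=1, format='csr')   # strictly above the diagonal
lower = tril(b, k=0, format='csr')   # diagonal and below

mixed = np.asarray((upper + lower).todense())
print(mixed)
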
Example #26
def main(args=None):

    args = parse_arguments().parse_args(args)
    short_v_long_range = []
    sum_smaller = []
    sum_greater = []
    for matrix in args.matrices:

        is_cooler = check_cooler(matrix)
        if not is_cooler:
            hic_matrix = hm.hiCMatrix(matrix)
        else:
            hic_matrix = matrix
        if args.chromosomes is None:
            # get all chromosomes from cooler file
            if not is_cooler:
                chromosomes_list = list(hic_matrix.chrBinBoundaries)
            else:
                chromosomes_list = cooler.Cooler(matrix).chromnames
        else:
            chromosomes_list = args.chromosomes

        short_v_long_range_matrix_threads = [None] * args.threads
        sum_smaller_threads = [None] * args.threads
        sum_greater_threads = [None] * args.threads

        chromosomesListPerThread = len(chromosomes_list) // args.threads
        all_data_collected = False
        queue = [None] * args.threads
        process = [None] * args.threads
        thread_done = [False] * args.threads
        for i in range(args.threads):

            if i < args.threads - 1:
                chromosomeListThread = chromosomes_list[
                    i * chromosomesListPerThread:(i + 1) *
                    chromosomesListPerThread]
            else:
                chromosomeListThread = chromosomes_list[
                    i * chromosomesListPerThread:]

            queue[i] = Queue()
            process[i] = Process(target=compute_relation_short_long_range,
                                 kwargs=dict(pHiCMatrix=hic_matrix,
                                             pChromosomes=chromosomeListThread,
                                             pDistance=args.distance,
                                             pIsCooler=is_cooler,
                                             pQueue=queue[i]))

            process[i].start()

        while not all_data_collected:
            for i in range(args.threads):
                if queue[i] is not None and not queue[i].empty():
                    short_v_long_range_matrix_threads[i], sum_smaller_threads[
                        i], sum_greater_threads[i] = queue[i].get()
                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True
            all_data_collected = True
            for thread in thread_done:
                if not thread:
                    all_data_collected = False
            time.sleep(1)

        short_v_long_range_matrix = [
            item for sublist in short_v_long_range_matrix_threads
            for item in sublist
        ]
        sum_smaller_matrix = [
            item for sublist in sum_smaller_threads for item in sublist
        ]
        sum_greater_matrix = [
            item for sublist in sum_greater_threads for item in sublist
        ]

        short_v_long_range.append(short_v_long_range_matrix)
        sum_smaller.append(sum_smaller_matrix)
        sum_greater.append(sum_greater_matrix)

    log.debug(short_v_long_range)
    plt.ylabel('Sum short range / long range')
    plt.tick_params(axis='x',
                    which='both',
                    bottom=False,
                    top=False,
                    labelbottom=False)

    box_plot = plt.boxplot(short_v_long_range, patch_artist=True)
    legend_handels_color = []
    for i, patch in enumerate(box_plot['boxes']):
        patch.set_facecolor(args.colorList[i % len(args.colorList)])
        legend_handels_color.append(
            mpatches.Patch(color=args.colorList[i % len(args.colorList)],
                           label=args.matrices[i].split('/')[-1]))
    plt.legend(handles=legend_handels_color)
    plt.savefig(args.plotFileName, dpi=args.dpi)

    if len(args.matrices) > 1:
        p_values = []
        for i, sample in enumerate(short_v_long_range):
            for sample2 in short_v_long_range[i + 1:]:
                statistic, significance_level = ranksums(sample, sample2)
                p_values.append(significance_level)
        log.debug('p_values {}'.format(p_values))
        with open(args.outFileName, 'w') as file:
            header = '# Created with HiCExplorer\'s hicPlotSVL ' + __version__ + '\n'
            header += "# Short range vs long range contacts per chromosome, p-values of each distribution against each other distribution with Wilcoxon rank-sum\n"
            header += '# Short range contacts: <= ' + str(args.distance) + '\n'
            file.write(header)
            counter = 0
            for i, matrix_0 in enumerate(args.matrices):
                for j, matrix_1 in enumerate(args.matrices[i + 1:]):
                    file.write(matrix_0 + '\t' + matrix_1 + '\t' +
                               str(p_values[counter]) + '\n')
                    counter += 1

    with open(args.outFileNameData, 'w') as file:
        header = '# Created with HiCExplorer\'s hicPlotSVL ' + __version__ + '\n'
        header += "# Short range vs long range contacts per chromosome: raw data\n"
        header += '# Short range contacts: <= ' + str(args.distance) + '\n'
        matrices_names = '\t\t\t'.join(args.matrices)
        header += '#\t{}\n'.format(matrices_names)
        header += '# Chromosome\t'
        header += '\t'.join([
            'Ratio', 'Sum <= {}'.format(args.distance), 'Sum > {}'.format(
                args.distance)
        ] * len(args.matrices))
        header += '\n'
        file.write(header)
        counter = 0
        for i, chromosome in enumerate(chromosomes_list):
            file.write('{}\t'.format(chromosome))
            for j, matrix in enumerate(args.matrices):
                if i < len(short_v_long_range[j]):
                    file.write('{}\t{}\t{}\t'.format(short_v_long_range[j][i],
                                                     sum_smaller[j][i],
                                                     sum_greater[j][i]))
                else:
                    file.write('\t')

            file.write('\n')
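
# Illustrative sketch (not part of HiCExplorer): the pairwise Wilcoxon rank-sum
# tests computed above for every pair of matrices, shown on made-up
# per-chromosome ratio distributions.
from scipy.stats import ranksums

distributions = {
    'matrix_a': [1.2, 1.4, 1.1, 1.3, 1.5],
    'matrix_b': [0.9, 1.0, 0.8, 1.1, 0.95],
}

names = list(distributions)
for i, name_a in enumerate(names):
    for name_b in names[i + 1:]:
        statistic, p_value = ranksums(distributions[name_a], distributions[name_b])
        print('{}\t{}\t{}'.format(name_a, name_b, p_value))
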
Example #27
def main(args=None):
    args = parse_arguments().parse_args(args)

    viewpointObj = Viewpoint()
    referencePoints, _ = viewpointObj.readReferencePointFile(
        args.referencePoints)

    relative_positions = set()
    bin_size = 0

    # For each condition (matrix):
    # - compute all viewpoints and smooth them (sliding window approach)
    # - after smoothing, sum all viewpoints up into one profile
    # - compute the percentage of each position with respect to the total interaction count
    # For the models of all conditions:
    # - compute the negative binomial (nbinom) parameters

    referencePointsPerThread = len(referencePoints) // args.threads
    queue = [None] * args.threads
    process = [None] * args.threads
    background_model_data = None
    fail_flag = False
    fail_message = ''

    for matrix in args.matrices:
        hic_ma = hm.hiCMatrix(matrix)
        viewpointObj.hicMatrix = hic_ma

        bin_size = hic_ma.getBinSize()
        all_data_collected = False
        thread_done = [False] * args.threads
        for i in range(args.threads):

            if i < args.threads - 1:
                referencePointsThread = referencePoints[i * referencePointsPerThread:(i + 1) * referencePointsPerThread]
            else:
                referencePointsThread = referencePoints[i * referencePointsPerThread:]

            queue[i] = Queue()
            process[i] = Process(target=compute_background, kwargs=dict(
                pReferencePoints=referencePointsThread,
                pViewpointObj=viewpointObj,
                pArgs=args,
                pQueue=queue[i]
            )
            )

            process[i].start()

        while not all_data_collected:
            for i in range(args.threads):
                if queue[i] is not None and not queue[i].empty():
                    background_data_thread = queue[i].get()
                    if 'Fail:' in background_data_thread:
                        fail_flag = True
                        fail_message = background_data_thread[6:]
                        queue[i] = None
                        process[i].join()
                        process[i].terminate()
                        process[i] = None
                        thread_done[i] = True
                        continue
                    background_model_data_thread, relative_positions_thread = background_data_thread
                    if background_model_data is None:
                        background_model_data = background_model_data_thread
                    else:
                        for relativePosition in background_model_data_thread:
                            if relativePosition in background_model_data:
                                background_model_data[relativePosition].extend(
                                    background_model_data_thread[relativePosition])
                            else:
                                background_model_data[relativePosition] = background_model_data_thread[relativePosition]

                    relative_positions = relative_positions.union(
                        relative_positions_thread)
                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True
            all_data_collected = True
            for thread in thread_done:
                if not thread:
                    all_data_collected = False
            time.sleep(1)

        del hic_ma
        del viewpointObj.hicMatrix

    if fail_flag:
        log.error('An error occurred caused by one or many faulty reference points.')
        log.error('Please run chicQualityControl to remove these from your reference point file: {}'.format(args.referencePoints))
        log.error(fail_message)
        exit(1)
    # for models of all conditions:
    # - fit negative binomial for each relative distance
    relative_positions = sorted(relative_positions)
    nbinom_parameters = {}
    max_value = {}
    mean_value = {}
    sum_all_values = 0
    data_of_distribution = None
    for relative_position in relative_positions:

        if args.truncateZeros:
            data_of_distribution = np.array(background_model_data[relative_position])
            mask = data_of_distribution > 0.0
            data_of_distribution = data_of_distribution[mask]
        else:
            data_of_distribution = np.array(background_model_data[relative_position])
        nbinom_parameters[relative_position] = fit_nbinom.fit(data_of_distribution)

        if len(data_of_distribution) > 0:
            max_value[relative_position] = np.max(data_of_distribution)
            average_value = np.average(data_of_distribution)
            mean_value[relative_position] = average_value
            sum_all_values += average_value
        else:
            max_value[relative_position] = 0.0
            average_value = 0.0
            mean_value[relative_position] = 0.0
            sum_all_values += 0.0

    for relative_position in relative_positions:
        mean_value[relative_position] /= sum_all_values
    # write result to file
    with open(args.outFileName, 'w') as file:
        file.write(
            'Relative position\tsize nbinom\tprob nbinom\tmax value\tmean value\n')

        for relative_position in relative_positions:
            relative_position_in_genomic_scale = relative_position * bin_size
            file.write("{}\t{:.12f}\t{:.12f}\t{:.12f}\t{:.12f}\n".format(relative_position_in_genomic_scale, nbinom_parameters[relative_position]['size'],
                                                                         nbinom_parameters[relative_position]['prob'], max_value[relative_position], mean_value[relative_position]))
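
# Illustrative sketch (not part of HiCExplorer): the per-distance summary done at
# the end of the main() above, reduced to numpy. Zeros are optionally dropped,
# the maximum and mean per relative position are recorded, and the means are then
# normalized so they sum to one across all distances. The toy data are made up.
import numpy as np

background_model_data = {     # relative position -> interaction values
    -1: [0.0, 2.0, 4.0],
    0: [5.0, 7.0],
    1: [1.0, 0.0, 2.0],
}
truncate_zeros = True

max_value, mean_value, sum_all_values = {}, {}, 0.0
for position, values in sorted(background_model_data.items()):
    data = np.array(values)
    if truncate_zeros:
        data = data[data > 0.0]
    max_value[position] = data.max() if len(data) else 0.0
    mean_value[position] = data.mean() if len(data) else 0.0
    sum_all_values += mean_value[position]

for position in mean_value:
    mean_value[position] /= sum_all_values

print(max_value)
print(mean_value)   # the means now sum to 1.0 across relative positions
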
Example #28
def main(args=None):

    args = parse_arguments().parse_args(args)
    hic_matrix_list = []
    sum_list = []
    for matrix in args.matrices:
        hic_ma = hm.hiCMatrix(matrix)
        if args.normalize == 'smallest':
            sum_list.append(hic_ma.matrix.sum())
        hic_matrix_list.append(hic_ma)

    if args.normalize == 'norm_range':
        for i, hic_matrix in enumerate(hic_matrix_list):
            hic_matrix.matrix.data = hic_matrix.matrix.data.astype(np.float32)
            mask = np.isnan(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0

            mask = np.isinf(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0
            min_value = np.min(hic_matrix.matrix.data)
            max_value = np.max(hic_matrix.matrix.data)
            min_max_difference = np.float64(max_value - min_value)

            hic_matrix.matrix.data -= min_value
            hic_matrix.matrix.data /= min_max_difference

            mask = np.isnan(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0

            mask = np.isinf(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0
            hic_matrix.matrix.eliminate_zeros()

            hic_matrix.save(args.outFileName[i], pApplyCorrection=False)
    elif args.normalize == 'smallest':
        argmin = np.argmin(sum_list)

        for i, hic_matrix in enumerate(hic_matrix_list):
            hic_matrix.matrix.data = hic_matrix.matrix.data.astype(np.float32)
            if i != argmin:
                mask = np.isnan(hic_matrix.matrix.data)
                hic_matrix.matrix.data[mask] = 0

                mask = np.isinf(hic_matrix.matrix.data)
                hic_matrix.matrix.data[mask] = 0
                adjust_factor = sum_list[i] / sum_list[argmin]
                hic_matrix.matrix.data /= adjust_factor

            mask = np.isnan(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0

            mask = np.isinf(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0
            hic_matrix.matrix.eliminate_zeros()

            hic_matrix.save(args.outFileName[i], pApplyCorrection=False)
    elif args.normalize == 'multiplicative':

        for i, hic_matrix in enumerate(hic_matrix_list):
            hic_matrix.matrix.data = hic_matrix.matrix.data.astype(np.float32)

            mask = np.isnan(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0

            mask = np.isinf(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0
            hic_matrix.matrix.data *= args.multiplicativeValue

            mask = np.isnan(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0

            mask = np.isinf(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0
            hic_matrix.matrix.eliminate_zeros()

            hic_matrix.save(args.outFileName[i], pApplyCorrection=False)
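
# Illustrative sketch (not part of HiCExplorer): the 'norm_range' branch above,
# i.e. a min-max rescaling of the non-zero entries of a sparse matrix to [0, 1],
# with NaN values zeroed out before and after. The toy matrix is made up.
import numpy as np
from scipy.sparse import csr_matrix

mat = csr_matrix(np.array([[0.0, 2.0, 8.0],
                           [2.0, 0.0, np.nan],
                           [8.0, np.nan, 4.0]]))

data = mat.data.astype(np.float32)
data[np.isnan(data)] = 0
data[np.isinf(data)] = 0

min_value = data.min()
span = np.float64(data.max() - min_value)
data = (data - min_value) / span
data[np.isnan(data)] = 0

mat.data = data
mat.eliminate_zeros()
print(mat.toarray())
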
Example #29
def test_maskBins():
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    matrix = np.array([[1, 8, 5, 3, 0], [0, 4, 15, 5, 1], [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1], [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)

    nt.assert_equal(hic.getMatrix(), matrix)
    nt.assert_equal(hic.orig_bin_ids, [])

    new_matrix = np.array([[0, 0, 2], [0, 0, 1], [0, 0, 0]])

    masking_ids = [0, 1]
    hic.maskBins(masking_ids)

    nt.assert_equal(hic.getMatrix(), new_matrix)
    nt.assert_equal(
        sorted(hic.orig_cut_intervals),
        sorted([('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                ('b', 30, 40, 1), ('b', 40, 50, 1)]))
    nt.assert_equal(
        sorted(hic.cut_intervals),
        sorted([('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]))
    nt.assert_equal(hic.chrBinBoundaries,
                    OrderedDict([('a', (0, 1)), ('b', (1, 3))]))
    nt.assert_equal(sorted(hic.orig_bin_ids), sorted([0, 1, 2, 3, 4]))

    # direct return if masking_ids is None or has len() == 0, thus no changes to matrix
    masking_ids = None
    hic.maskBins(masking_ids)

    nt.assert_equal(hic.getMatrix(), new_matrix)
    nt.assert_equal(
        sorted(hic.orig_cut_intervals),
        sorted([('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                ('b', 30, 40, 1), ('b', 40, 50, 1)]))
    nt.assert_equal(
        sorted(hic.cut_intervals),
        sorted([('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]))
    nt.assert_equal(hic.chrBinBoundaries,
                    OrderedDict([('a', (0, 1)), ('b', (1, 3))]))

    masking_ids = []

    hic.maskBins(masking_ids)

    nt.assert_equal(hic.getMatrix(), new_matrix)
    nt.assert_equal(
        sorted(hic.orig_cut_intervals),
        sorted([('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                ('b', 30, 40, 1), ('b', 40, 50, 1)]))
    nt.assert_equal(
        sorted(hic.cut_intervals),
        sorted([('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]))
    nt.assert_equal(hic.chrBinBoundaries,
                    OrderedDict([('a', (0, 1)), ('b', (1, 3))]))

    nt.assert_equal(sorted(hic.orig_bin_ids), sorted([0, 1, 2, 3, 4]))
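
# Illustrative sketch (not part of HiCExplorer): what maskBins does to the matrix
# itself, ignoring the bookkeeping of cut_intervals and orig_bin_ids. Rows and
# columns of the masked bin ids are removed and the remaining bins keep their
# order; the result below matches the 3x3 matrix checked in the test above.
import numpy as np
from scipy.sparse import csr_matrix

matrix = csr_matrix(np.array([[1, 8, 5, 3, 0],
                              [0, 4, 15, 5, 1],
                              [0, 0, 0, 0, 2],
                              [0, 0, 0, 0, 1],
                              [0, 0, 0, 0, 0]]))
masking_ids = [0, 1]

keep = np.setdiff1d(np.arange(matrix.shape[0]), masking_ids)
masked = matrix[keep, :][:, keep]
print(masked.toarray())
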
def main(args=None):
    args = parse_arguments().parse_args(args)

    # read domains file
    domains_df = readDomainBoundaries(args.tadDomains)
    # read full h5 or only region if cooler
    is_cooler_target = check_cooler(args.targetMatrix)
    is_cooler_control = check_cooler(args.controlMatrix)

    if is_cooler_target != is_cooler_control:
        log.error('Matrices are not given in the same format!')
        exit(1)
    if not is_cooler_control:
        hic_matrix_target = hm.hiCMatrix(args.targetMatrix)
        hic_matrix_control = hm.hiCMatrix(args.controlMatrix)
    else:
        hic_matrix_target = args.targetMatrix
        hic_matrix_control = args.controlMatrix
    # accepted_H0 = []
    # rejected_H0 = []
    # log.debug('domains_df {}'.format(domains_df))
    domains = domains_df.values.tolist()

    p_values_threads = [None] * args.threads
    accepted_left_inter_threads = [None] * args.threads
    accepted_right_inter_threads = [None] * args.threads
    accepted_intra_threads = [None] * args.threads
    rows_threads = [None] * args.threads

    domainsPerThread = len(domains) // args.threads
    all_data_collected = False
    queue = [None] * args.threads
    process = [None] * args.threads
    thread_done = [False] * args.threads

    # None --> first thread: process the first element in the list, ignore the last one
    # True --> middle thread: ignore the first and the last element during TAD processing
    # False --> last thread: ignore the first element, process the last one
    thread_id = None
    for i in range(args.threads):

        if i == 0:
            domainListThread = domains[i * domainsPerThread:(
                (i + 1) * domainsPerThread) + 1]
            thread_id = None
        elif i < args.threads - 1:
            domainListThread = domains[(i * domainsPerThread) -
                                       1:((i + 1) * domainsPerThread) + 1]
            thread_id = True

        else:
            domainListThread = domains[(i * domainsPerThread) - 1:]
            thread_id = False

        if args.threads == 1:
            thread_id = ''
        queue[i] = Queue()
        process[i] = Process(target=computeDifferentialTADs,
                             kwargs=dict(pMatrixTarget=hic_matrix_target,
                                         pMatrixControl=hic_matrix_control,
                                         pDomainList=domainListThread,
                                         pCoolOrH5=is_cooler_control,
                                         pPValue=args.pValue,
                                         pThreadId=thread_id,
                                         pQueue=queue[i]))

        process[i].start()

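    # poll the queues until every worker has delivered its results and has been joined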
    while not all_data_collected:
        for i in range(args.threads):
            if queue[i] is not None and not queue[i].empty():
                p_values_threads[i], accepted_left_inter_threads[i], \
                    accepted_right_inter_threads[i], \
                    accepted_intra_threads[i], rows_threads[i] = queue[i].get()

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = all(thread_done)
        time.sleep(1)

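    # flatten the per-thread result lists into single lists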
    p_values_list = [item for sublist in p_values_threads for item in sublist]
    accepted_inter_left = [
        item for sublist in accepted_left_inter_threads for item in sublist
    ]
    accepted_inter_right = [
        item for sublist in accepted_right_inter_threads for item in sublist
    ]
    accepted_intra = [
        item for sublist in accepted_intra_threads for item in sublist
    ]
    rows = [item for sublist in rows_threads for item in sublist]

    p_values_list = np.array(p_values_list)
    # cast the acceptance flags to bool so that '~mask' below is a logical
    # negation rather than a bitwise NOT on integers
    accepted_inter_left = np.array(accepted_inter_left, dtype=bool)
    accepted_inter_right = np.array(accepted_inter_right, dtype=bool)
    accepted_intra = np.array(accepted_intra, dtype=bool)
    rows = np.array(rows)

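    # build the rejection mask according to --mode and --modeReject:
    # 'all' requires the test to be significant in every selected region,
    # otherwise one significant region is sufficient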
    if args.mode == 'intra-TAD':
        mask = accepted_intra
    elif args.mode == 'left-inter-TAD':
        if args.modeReject == 'all':
            mask = np.logical_and(accepted_inter_left, accepted_intra)
        else:
            mask = np.logical_or(accepted_inter_left, accepted_intra)

    elif args.mode == 'right-inter-TAD':
        if args.modeReject == 'all':
            mask = np.logical_and(accepted_intra, accepted_inter_right)
        else:
            mask = np.logical_or(accepted_intra, accepted_inter_right)

    else:
        if args.modeReject == 'all':
            mask = np.logical_and(accepted_inter_left, accepted_inter_right)
            mask = np.logical_and(mask, accepted_intra)
        else:
            mask = np.logical_or(accepted_inter_left, accepted_inter_right)
            mask = np.logical_or(mask, accepted_intra)

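    # split p-values and TAD rows into accepted (H0 kept, non-differential) and
    # rejected (H0 rejected, differential) sets and write one output file for each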
    accepted_H0 = p_values_list[~mask]
    rejected_H0 = p_values_list[mask]
    accepted_rows = rows[~mask]
    rejected_rows = rows[mask]
    with open(args.outFileNamePrefix + '_accepted.diff_tad', 'w') as file:
        header = '# Created with HiCExplorer\'s hicDifferentialTAD version ' + __version__ + '\n'
        header += '# H0 (\'regions are equal\') is accepted for all p-values greater than the given p-value threshold; i.e. regions in this file are not considered differential.\n'
        header += '# Accepted regions, Wilcoxon rank-sum test with p-value threshold: {}, mode: {}, modeReject: {}\n'.format(
            args.pValue, args.mode, args.modeReject)
        header += '# Chromosome\tstart\tend\tname\tscore\tstrand\tp-value left-inter-TAD\tp-value right-inter-TAD\tp-value intra-TAD\n'
        file.write(header)
        for i, row in enumerate(accepted_rows):
            row_list = list(map(str, row))
            file.write('\t'.join(row_list))
            file.write('\t')
            pvalue_list = list(map(str, accepted_H0[i]))
            file.write('\t'.join(pvalue_list))

            file.write('\n')
    with open(args.outFileNamePrefix + '_rejected.diff_tad', 'w') as file:
        header = '# Created with HiCExplorer\'s hicDifferentialTAD version ' + __version__ + '\n'
        header += '# H0 (\'regions are equal\') is rejected for all p-values smaller than or equal to the given p-value threshold; i.e. regions in this file are considered differential.\n'
        header += '# Rejected regions, Wilcoxon rank-sum test with p-value threshold: {}, mode: {}, modeReject: {}\n'.format(
            args.pValue, args.mode, args.modeReject)
        header += '# Chromosome\tstart\tend\tname\tscore\tstrand\tp-value left-inter-TAD\tp-value right-inter-TAD\tp-value intra-TAD\n'

        file.write(header)

        for i, row in enumerate(rejected_rows):
            row_list = list(map(str, row))
            file.write('\t'.join(row_list))
            file.write('\t')
            pvalue_list = list(map(str, rejected_H0[i]))
            file.write('\t'.join(pvalue_list))
            file.write('\n')