def computeDifferentialTADs(pMatrixTarget, pMatrixControl, pDomainList,
                            pCoolOrH5, pPValue, pThreadId, pQueue):
    accepted_inter_left = []
    accepted_inter_right = []
    accepted_intra = []
    p_values_list = []
    rows = []

    for i, row in enumerate(pDomainList):

        if pThreadId is None:
            log.debug('first thread')
            if i == len(pDomainList) - 1:
                continue
        elif pThreadId is True:
            log.debug('middle thread')

            if i == 0 or i == len(pDomainList) - 1:
                continue
        elif pThreadId is False:
            log.debug('last thread')

            if i == 0:
                continue

        if i - 1 >= 0:
            chromosom = pDomainList[i - 1][0]
            start = pDomainList[i - 1][1]
        else:
            chromosom = pDomainList[i][0]
            start = pDomainList[i][1]
        if i + 1 < len(pDomainList):
            end = pDomainList[i + 1][2]
        else:
            end = pDomainList[i][2]
        # midpos = row[1] + ((row[2] - row[1]) / 2)

        if pCoolOrH5:

            # get intra-TAD data
            hic_matrix_target = hm.hiCMatrix(
                pMatrixFile=pMatrixTarget,
                pChrnameList=[
                    str(row[0]) + ':' + str(row[1]) + '-' + str(row[2])
                ])
            hic_matrix_control = hm.hiCMatrix(
                pMatrixFile=pMatrixControl,
                pChrnameList=[
                    str(row[0]) + ':' + str(row[1]) + '-' + str(row[2])
                ])
            matrix_target = hic_matrix_target.matrix.toarray()
            matrix_control = hic_matrix_control.matrix.toarray()

            hic_matrix_target_inter_tad = hm.hiCMatrix(
                pMatrixFile=pMatrixTarget,
                pChrnameList=[
                    str(chromosom) + ':' + str(start) + '-' + str(end)
                ])
            hic_matrix_control_inter_tad = hm.hiCMatrix(
                pMatrixFile=pMatrixControl,
                pChrnameList=[
                    str(chromosom) + ':' + str(start) + '-' + str(end)
                ])

            matrix_target_inter_tad = hic_matrix_target_inter_tad.matrix
            matrix_control_inter_tad = hic_matrix_control_inter_tad.matrix

        else:
            # in the h5 case, pMatrixTarget and pMatrixControl are already HiCMatrix objects
            hic_matrix_target = pMatrixTarget
            hic_matrix_control = pMatrixControl
            hic_matrix_target_inter_tad = pMatrixTarget
            hic_matrix_control_inter_tad = pMatrixControl
            indices_target = hic_matrix_target.getRegionBinRange(
                str(row[0]), row[1], row[2])
            indices_control = hic_matrix_control.getRegionBinRange(
                str(row[0]), row[1], row[2])

            matrix_target = hic_matrix_target.matrix[
                indices_target[0]:indices_target[1],
                indices_target[0]:indices_target[1]].toarray()
            matrix_control = hic_matrix_control.matrix[
                indices_control[0]:indices_control[1],
                indices_control[0]:indices_control[1]].toarray()
            matrix_target_inter_tad = pMatrixTarget.matrix
            matrix_control_inter_tad = pMatrixControl.matrix

        matrix_target = matrix_target.flatten()
        matrix_control = matrix_control.flatten()
        # tad_midpoint = hic_matrix_target_inter_tad.getRegionBinRange(str(row[0]), midpos, midpos)[0]

        # index of the boundary between the left neighboring TAD and the current TAD
        left_boundary_index_target = hic_matrix_target_inter_tad.getRegionBinRange(
            str(chromosom), row[1], row[1])[0]
        left_boundary_index_control = hic_matrix_control_inter_tad.getRegionBinRange(
            str(chromosom), row[1], row[1])[0]
        if pCoolOrH5:
            outer_left_boundary_index_target = 0
            outer_left_boundary_index_control = 0

            outer_right_boundary_index_control = -1
            outer_right_boundary_index_target = -1

        else:
            outer_left_boundary_index_target = hic_matrix_target_inter_tad.getRegionBinRange(
                str(chromosom), start, end)[0]
            outer_left_boundary_index_control = hic_matrix_control_inter_tad.getRegionBinRange(
                str(chromosom), start, end)[0]

            outer_right_boundary_index_control = hic_matrix_control_inter_tad.getRegionBinRange(
                str(chromosom), start, end)[1]
            outer_right_boundary_index_target = hic_matrix_target_inter_tad.getRegionBinRange(
                str(chromosom), start, end)[1]

        if i + 1 < len(pDomainList) and not pCoolOrH5:
            # index of the boundary between the current TAD and its right neighbor
            right_boundary_index_target = hic_matrix_target_inter_tad.getRegionBinRange(
                str(chromosom), row[2], row[2])[0]
            right_boundary_index_control = hic_matrix_control_inter_tad.getRegionBinRange(
                str(chromosom), row[2], row[2])[0]
        elif i + 1 < len(pDomainList) - 1:
            right_boundary_index_target = hic_matrix_target_inter_tad.getRegionBinRange(
                str(chromosom), row[2], row[2])[0]
            right_boundary_index_control = hic_matrix_control_inter_tad.getRegionBinRange(
                str(chromosom), row[2], row[2])[0]

        if i - 1 >= 0 and i + 1 < len(pDomainList):
            intertad_left_target = matrix_target_inter_tad[
                outer_left_boundary_index_target:left_boundary_index_target,
                left_boundary_index_target:
                right_boundary_index_target].toarray()
            intertad_right_target = matrix_target_inter_tad[
                left_boundary_index_target:right_boundary_index_target,
                right_boundary_index_target:
                outer_right_boundary_index_target].toarray()
            intertad_left_control = matrix_control_inter_tad[
                outer_left_boundary_index_control:left_boundary_index_control,
                left_boundary_index_control:
                right_boundary_index_control].toarray()
            intertad_right_control = matrix_control_inter_tad[
                left_boundary_index_control:right_boundary_index_control,
                right_boundary_index_control:
                outer_right_boundary_index_control].toarray()

        elif i - 1 < 0 and i + 1 < len(pDomainList):
            intertad_right_target = matrix_target_inter_tad[
                left_boundary_index_target:right_boundary_index_target,
                right_boundary_index_target:
                outer_right_boundary_index_target].toarray()
            intertad_right_control = matrix_control_inter_tad[
                left_boundary_index_control:right_boundary_index_control,
                right_boundary_index_control:
                outer_right_boundary_index_control].toarray()

        elif i - 1 > 0 and i + 1 >= len(pDomainList):
            intertad_left_target = matrix_target_inter_tad[
                outer_left_boundary_index_target:left_boundary_index_target,
                left_boundary_index_target:
                right_boundary_index_target].toarray()
            intertad_left_control = matrix_control_inter_tad[
                outer_left_boundary_index_control:left_boundary_index_control,
                left_boundary_index_control:
                right_boundary_index_control].toarray()

        significance_level_left = None
        significance_level_right = None
        statistic_left = None
        statistic_right = None

        if i - 1 >= 0 and i + 1 < len(pDomainList):
            intertad_left_target = intertad_left_target.flatten()
            intertad_left_control = intertad_left_control.flatten()
            intertad_right_target = intertad_right_target.flatten()
            intertad_right_control = intertad_right_control.flatten()

            statistic_left, significance_level_left = ranksums(
                intertad_left_target, intertad_left_control)
            statistic_right, significance_level_right = ranksums(
                intertad_right_target, intertad_right_control)
        elif i - 1 < 0 and i + 1 < len(pDomainList):
            intertad_right_target = intertad_right_target.flatten()
            intertad_right_control = intertad_right_control.flatten()
            statistic_right, significance_level_right = ranksums(
                intertad_right_target, intertad_right_control)
        elif i - 1 > 0 and i + 1 >= len(pDomainList):
            intertad_left_target = intertad_left_target.flatten()
            intertad_left_control = intertad_left_control.flatten()
            log.debug('intertad_left_target {}'.format(intertad_left_target))
            log.debug('intertad_left_control {}'.format(intertad_left_control))

            statistic_left, significance_level_left = ranksums(
                intertad_left_target, intertad_left_control)

        # log.debug('matrix_target {}'.format(matrix_target))
        # log.debug('matrix_control {}'.format(matrix_control))

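        # Wilcoxon rank-sum test (scipy.stats.ranksums): are the intra-TAD contacts
        # of target and control drawn from the same distribution?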
        statistic, significance_level = ranksums(matrix_target, matrix_control)
        log.debug('statistic {}, significance_level {}'.format(
            statistic, significance_level))
        log.debug('right statistic {}, significance_level {}'.format(
            statistic_right, significance_level_right))
        log.debug('left statistic {}, significance_level {}'.format(
            statistic_left, significance_level_left))

        p_values = []
        if significance_level_left is None or np.isnan(
                significance_level_left):
            accepted_inter_left.append(0)
            p_values.append(np.nan)
        elif significance_level_left <= pPValue:
            accepted_inter_left.append(1)
            p_values.append(significance_level_left)
        else:
            accepted_inter_left.append(0)
            p_values.append(significance_level_left)

        if significance_level_right is None or np.isnan(
                significance_level_right):
            accepted_inter_right.append(0)
            p_values.append(np.nan)
        elif significance_level_right <= pPValue:
            accepted_inter_right.append(1)
            p_values.append(significance_level_right)
        else:
            accepted_inter_right.append(0)
            p_values.append(significance_level_right)

        if significance_level is None or np.isnan(significance_level):
            accepted_intra.append(0)
            p_values.append(np.nan)
        elif significance_level <= pPValue:
            accepted_intra.append(1)
            p_values.append(significance_level)
        else:
            accepted_intra.append(0)
            p_values.append(significance_level)

        p_values_list.append(p_values)

        rows.append(row)
    # hic_matrix_target_inter_tad.save('manipulated_target.cool')
    # hic_matrix_control_inter_tad.save('manipulated_control.cool')
    pQueue.put([
        p_values_list, accepted_inter_left, accepted_inter_right,
        accepted_intra, rows
    ])
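
# A minimal driver sketch for the worker above, assuming two hypothetical matrix
# files ('target.cool', 'control.cool') and a tiny made-up domain list; judging by
# the pThreadId handling, the surrounding tool splits the domain list over several
# such processes. This is an illustration, not the tool's actual driver code.
from multiprocessing import Process, Queue


def run_differential_tads_sketch():
    domains = [('chr1', 0, 200000), ('chr1', 200000, 500000),
               ('chr1', 500000, 800000)]
    queue = Queue()
    worker = Process(target=computeDifferentialTADs,
                     args=('target.cool', 'control.cool', domains,
                           True, 0.05, None, queue))
    worker.start()
    p_values, left, right, intra, rows = queue.get()
    worker.join()
    return p_values, left, right, intra, rows
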
def open_and_store_matrix(pMatrixName, pMatricesList, pIndex, pXDimension,
                          pChromosomes, pNorm, pExtraTrack, pHistonMarkType,
                          pBinarization, pQueue):
    compartments_matrix = None

    for i, matrix in enumerate(pMatricesList):

        ma = hm.hiCMatrix(pMatrixName + '::' + matrix)

        # WARNING
        # DO NOT APPLY BIN MASKING, WILL LEAD TO DIFFERENT SIZES OF THE CHROMOSOMES
        # THIS IS CAUSING A FAIL OF THE COMPUTATION
        # ma.maskBins(ma.nan_bins)
        k = 1
        if pChromosomes:
            ma.keepOnlyTheseChr(pChromosomes)

        vecs_list = []
        chrom_list = []
        start_list = []
        end_list = []
        # PCA is computed per chromosome
        length_chromosome = 0
        chromosome_count = len(ma.getChrNames())

        for chrname in ma.getChrNames():
            chr_range = ma.getChrBinRange(chrname)
            length_chromosome += chr_range[1] - chr_range[0]

        if pExtraTrack and (pExtraTrack.endswith('.bw')
                            or pExtraTrack.endswith('.bigwig')):
            bwTrack = pyBigWig.open(pExtraTrack, 'r')

        for chrname in ma.getChrNames():
            chr_range = ma.getChrBinRange(chrname)
            submatrix = ma.matrix[chr_range[0]:chr_range[1],
                                  chr_range[0]:chr_range[1]]
            if pNorm:
                obs_exp_matrix_ = obs_exp_matrix_norm(submatrix)

            else:
                obs_exp_matrix_ = obs_exp_matrix_lieberman(
                    submatrix, length_chromosome, chromosome_count)
            obs_exp_matrix_ = convertNansToZeros(
                csr_matrix(obs_exp_matrix_)).todense()
            obs_exp_matrix_ = convertInfsToZeros(
                csr_matrix(obs_exp_matrix_)).todense()

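            # Pearson correlation of the observed/expected matrix; the first k
            # eigenvectors of its covariance are kept as the compartment signal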
            pearson_correlation_matrix = np.corrcoef(obs_exp_matrix_)
            pearson_correlation_matrix = convertNansToZeros(
                csr_matrix(pearson_correlation_matrix)).todense()
            pearson_correlation_matrix = convertInfsToZeros(
                csr_matrix(pearson_correlation_matrix)).todense()

            corrmatrix = np.cov(pearson_correlation_matrix)
            corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
            corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
            evals, eigs = linalg.eig(corrmatrix)

            chrom, start, end, _ = zip(
                *ma.cut_intervals[chr_range[0]:chr_range[1]])

            chrom_list += chrom
            start_list += start
            end_list += end
            if pExtraTrack and (pExtraTrack.endswith('.bw')
                                or pExtraTrack.endswith('.bigwig')):
                assert (len(end) == len(start))
                correlateEigenvectorWithHistonMarkTrack(
                    eigs[:, :k].transpose(), bwTrack, chrname, start, end,
                    pExtraTrack, pHistonMarkType)

            vecs_list += eigs[:, :k].tolist()
        if compartments_matrix is None:
            compartments_matrix = np.zeros(
                [pXDimension, len(np.array(vecs_list).flatten())],
                dtype=np.float64)

        eigenvector = np.real(np.array(vecs_list).flatten())
        eigenvector[np.isnan(eigenvector)] = 0
        eigenvector[np.isinf(eigenvector)] = 0

        if pBinarization:
            mask = eigenvector <= 0
            eigenvector[mask] = -1
            mask = eigenvector > 0
            eigenvector[mask] = 1

        compartments_matrix[pIndex + i, :] = eigenvector

    pQueue.put(compartments_matrix)

    return
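
# A small, self-contained illustration of the pBinarization step above: the
# eigenvector is reduced to its sign so the two compartment states can be told
# apart. The toy vector is made up; this mirrors, but is not, the code above.
import numpy as np


def binarize_eigenvector(eigenvector):
    ev = np.real(np.asarray(eigenvector, dtype=float)).copy()
    ev[np.isnan(ev)] = 0
    ev[np.isinf(ev)] = 0
    ev[ev <= 0] = -1
    ev[ev > 0] = 1
    return ev

# binarize_eigenvector([0.3, -0.1, float('nan'), 0.0]) -> array([ 1., -1., -1., -1.])
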
def main(args=None):

    args = parse_arguments().parse_args(args)

    hic_ma = hm.hiCMatrix(pMatrixFile=args.matrix)
    indices_values = []
    with open(args.regions, 'r') as file:
        for line in file.readlines():
            if len(line.strip()) == 0:
                continue
            _line = line.strip().split('\t')
            if len(_line) == 2:
                chrom, start = _line[0], _line[1]

                viewpoint = (chrom, start, start)
            elif len(_line) >= 3:
                chrom, start, end = _line[0], _line[1], _line[2]
                viewpoint = (chrom, start, end)
            if args.range:
                start_range_genomic, end_range_genomic, _ = calculateViewpointRange(
                    hic_ma, viewpoint, args.range)
                # min_length, max_length = hic_ma.getBinPos(hic_ma.getChrBinRange(pViewpoint[0])[1] - 1)[1:]
                # if start_range_genomic < min_length:
                #     log.warning('Ignoring {} {} {} because the reference point minus the range {} is smaller than the chromosome border.'.format(viewpoint[0], viewpoint[1], viewpoint[2], args.range))
                #     continue
                # if end_bin > :
                #     log.warning('Ignoring {} {} {} because the reference point plus the range {} is greater than the chromosome border.'.format(viewpoint[0], viewpoint[1], viewpoint[2], args.range))
                #     continue
                start_bin, end_bin = getBinIndices(
                    hic_ma, (chrom, start_range_genomic, end_range_genomic))
            else:
                start_bin, end_bin = calculateViewpointRangeBins(
                    hic_ma, viewpoint, args.rangeInBins)
            # if start_bin < 0:
            #     log.warning('Ignoring {} {} {} because the reference point minus the range {} is smaller than the chromosome border.'.format(viewpoint[0], viewpoint[1], viewpoint[2], args.range))
            #     continue
            # if end_bin > :
            #     log.warning('Ignoring {} {} {} because the reference point plus the range {} is greater than the chromosome border.'.format(viewpoint[0], viewpoint[1], viewpoint[2], args.range))
            #     continue
            indices_values.append([start_bin, end_bin])

    if args.range:
        dimensions_new_matrix = (args.range[0] // hic_ma.getBinSize()) + (
            args.range[1] // hic_ma.getBinSize())
    elif args.rangeInBins:
        dimensions_new_matrix = args.rangeInBins[0] + args.rangeInBins[1]
    # summed_matrix = csr_matrix((dimensions_new_matrix, dimensions_new_matrix), dtype=np.float32)
    summed_matrix = lil_matrix((dimensions_new_matrix, dimensions_new_matrix),
                               dtype=np.float32)

    max_length = hic_ma.matrix.shape[1]
    for start, end in indices_values:
        _start = 0
        _end = summed_matrix.shape[1]
        if start < 0:
            _start = np.absolute(start)
            start = 0
        if end >= max_length:
            _end = end
            end = max_length

        summed_matrix[_start:_end, _start:_end] += hic_ma.matrix[start:end,
                                                                 start:end]

    summed_matrix /= len(indices_values)

    summed_matrix = summed_matrix.tocsr()
    save_npz(args.outFileName, summed_matrix)
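
# A toy sketch of the accumulation loop above: equally sized windows of a small,
# made-up contact matrix are summed into one sub-matrix and averaged. The real
# code additionally clips windows that reach beyond the matrix borders.
import numpy as np
from scipy.sparse import csr_matrix

full = csr_matrix(np.arange(36, dtype=np.float32).reshape(6, 6))
windows = [(0, 3), (2, 5)]  # bin ranges around two hypothetical viewpoints
accumulated = np.zeros((3, 3), dtype=np.float32)
for window_start, window_end in windows:
    accumulated += full[window_start:window_end, window_start:window_end].toarray()
averaged = csr_matrix(accumulated / len(windows))
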
def main(args=None):
    args = parse_arguments().parse_args(args)
    if args.verbose:
        log.setLevel(logging.INFO)

    if check_cooler(args.matrix) and args.chromosomes is not None and len(
            args.chromosomes) == 1:
        ma = hm.hiCMatrix(args.matrix, pChrnameList=toString(args.chromosomes))
    else:
        ma = hm.hiCMatrix(args.matrix)

        if args.chromosomes:
            ma.reorderChromosomes(toString(args.chromosomes))

    # mask all zero value bins
    if 'correctionMethod' in args:
        if args.correctionMethod == 'ICE':
            row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
            log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
            ma.maskBins(np.flatnonzero(row_sum == 0))
            matrix_shape = ma.matrix.shape
    if 'plotName' in args:
        row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
        log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
        ma.maskBins(np.flatnonzero(row_sum == 0))
        matrix_shape = ma.matrix.shape

    ma.matrix = convertNansToZeros(ma.matrix)
    ma.matrix = convertInfsToZeros(ma.matrix)
    ma.matrix = ma.matrix.astype(np.float64, copy=True)

    log.debug('ma.matrix.indices {}'.format(ma.matrix.indices.dtype))
    log.debug('ma.matrix.data {}'.format(ma.matrix.data.dtype))
    log.debug('ma.matrix.indptr {}'.format(ma.matrix.indptr.dtype))

    # log.debug('ma.matrix.indices {}'.format(np.max(ma.matrix.indices)))
    # log.debug('ma.matrix.data {}'.format(np.max(ma.matrix.data)))
    # log.debug('ma.matrix.indptr {}'.format(np.max(ma.matrix.indptr)))

    # ma.matrix.indptr = ma.matrix.indptr.astype(np.int32, copy=False)
    # ma.matrix.indices = ma.matrix.indices.astype(np.int32, copy=False)

    if 'plotName' in args:
        plot_total_contact_dist(ma, args)
        log.info("Saving diagnostic plot {}\n".format(args.plotName))
        return

    log.info("matrix contains {} data points. Sparsity {:.3f}.".format(
        len(ma.matrix.data),
        float(len(ma.matrix.data)) / (ma.matrix.shape[0]**2)))

    if args.skipDiagonal:
        ma.diagflat(value=0)

    total_filtered_out = set()
    if args.correctionMethod == 'ICE':
        if not args.filterThreshold:
            log.error('min and max filtering thresholds should be set')
            sys.exit(1)
        outlier_regions = filter_by_zscore(ma,
                                           args.filterThreshold[0],
                                           args.filterThreshold[1],
                                           perchr=args.perchr)
        # compute and print some statistics
        pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
        ma.printchrtoremove(outlier_regions,
                            label="Bins that are MAD outliers ({:.2f}%) "
                            "out of {}".format(pct_outlier,
                                               ma.matrix.shape[0]),
                            restore_masked_bins=False)

        assert matrix_shape == ma.matrix.shape
        # mask filtered regions
        ma.maskBins(outlier_regions)
        total_filtered_out = set(outlier_regions)

        if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1:
            chrom, _, _, coverage = zip(*ma.cut_intervals)

            assert type(coverage[0]) == np.float64

            failed_bins = np.flatnonzero(
                np.array(coverage) < args.sequencedCountCutoff)

            ma.printchrtoremove(failed_bins,
                                label="Bins with low coverage",
                                restore_masked_bins=False)
            ma.maskBins(failed_bins)
            total_filtered_out = set(failed_bins)
            """
            ma.matrix, to_remove = fill_gaps(ma, failed_bins)
            log.warning("From {} failed bins, {} could "
                         "not be filled\n".format(len(failed_bins),
                                                  len(to_remove)))
            ma.maskBins(to_remove)
            """

        if args.transCutoff and 0 < args.transCutoff < 100:
            cutoff = float(args.transCutoff) / 100
            # a usual cutoff is 0.05
            ma.truncTrans(high=cutoff)
            pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()

    correction_factors = []
    corrected_matrix = lil_matrix(ma.matrix.shape)
    if args.perchr:
        # normalize each chromosome independently
        for chrname in list(ma.interval_trees):
            chr_range = ma.getChrBinRange(chrname)
            chr_submatrix = ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]
            if args.correctionMethod == 'ICE':
                _matrix, _corr_factors = iterative_correction(
                    chr_submatrix, args)
                corrected_matrix[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = _matrix
                correction_factors.append(_corr_factors)
            else:
                # Set the kr matrix along with its correction factors vector
                assert (args.correctionMethod == 'KR')
                log.debug("Loading a float sparse matrix for KR balancing")
                kr = kr_balancing(
                    chr_submatrix.shape[0], chr_submatrix.shape[1],
                    chr_submatrix.count_nonzero(),
                    chr_submatrix.indptr.astype(np.int64, copy=False),
                    chr_submatrix.indices.astype(np.int64, copy=False),
                    chr_submatrix.data.astype(np.float64, copy=False))
                kr.computeKR()
                if args.outFileName.endswith('.h5'):
                    corrected_matrix[
                        chr_range[0]:chr_range[1],
                        chr_range[0]:chr_range[1]] = kr.get_normalised_matrix(
                            True)
                # correction_factors.append(np.true_divide(1,
                #                                          kr.get_normalisation_vector(False).todense()))
                correction_factors.append(
                    kr.get_normalisation_vector(False).todense())

        correction_factors = np.concatenate(correction_factors)

    else:
        if args.correctionMethod == 'ICE':
            corrected_matrix, correction_factors = iterative_correction(
                ma.matrix, args)
            ma.setMatrixValues(corrected_matrix)
        else:
            assert (args.correctionMethod == 'KR')
            log.debug("Loading a float sparse matrix for KR balancing")
            kr = kr_balancing(ma.matrix.shape[0], ma.matrix.shape[1],
                              ma.matrix.count_nonzero(),
                              ma.matrix.indptr.astype(np.int64, copy=False),
                              ma.matrix.indices.astype(np.int64, copy=False),
                              ma.matrix.data.astype(np.float64, copy=False))
            log.debug('passed pointers')
            kr.computeKR()
            log.debug('computation done')

            # set it to False since the vector is already normalised
            # with the previous True
            # correction_factors = np.true_divide(1, kr.get_normalisation_vector(False).todense())
            correction_factors = kr.get_normalisation_vector(False).todense()

            if args.outFileName.endswith('.h5'):
                corrected_matrix = kr.get_normalised_matrix(True)

    if args.outFileName.endswith('.h5'):
        ma.setMatrixValues(corrected_matrix)
    ma.setCorrectionFactors(correction_factors)

    log.debug("Correction factors {}".format(correction_factors[:10]))
    if args.inflationCutoff and args.inflationCutoff > 0 and args.correctionMethod == 'ICE':

        after_row_sum = np.asarray(corrected_matrix.sum(axis=1)).flatten()
        # identify rows that were expanded more than args.inflationCutoff times
        to_remove = np.flatnonzero(
            after_row_sum / pre_row_sum >= args.inflationCutoff)
        ma.printchrtoremove(to_remove,
                            label="inflated >={} "
                            "regions".format(args.inflationCutoff),
                            restore_masked_bins=False)
        total_filtered_out = total_filtered_out.union(to_remove)
        ma.maskBins(to_remove)
    ma.printchrtoremove(sorted(list(total_filtered_out)),
                        label="Total regions to be removed",
                        restore_masked_bins=False)

    ma.save(args.outFileName, pApplyCorrection=False)
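
# A compact stand-in for the balancing idea used above. It is NOT the
# iterative_correction()/kr_balancing implementation called in main(); it only
# sketches ICE-style balancing: rows/columns are repeatedly rescaled towards
# equal coverage, and the accumulated per-bin scaling gives the correction factors.
import numpy as np


def toy_iterative_correction(matrix, iterations=50):
    m = np.asarray(matrix, dtype=float).copy()
    bias = np.ones(m.shape[0])
    for _ in range(iterations):
        coverage = m.sum(axis=1)
        nonzero = coverage > 0
        coverage[nonzero] /= coverage[nonzero].mean()
        coverage[~nonzero] = 1.0
        bias *= coverage
        m /= np.outer(coverage, coverage)
    return m, bias
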
Example #5
def test_restoreMaskedBins():
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    matrix = np.array([[1, 8, 5, 3, 0], [0, 4, 15, 5, 1], [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1], [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)

    nt.assert_equal(hic.getMatrix(), matrix)
    nt.assert_equal(hic.orig_bin_ids, [])

    # function should directly return if there are no masked_bins
    hic.restoreMaskedBins()

    nt.assert_equal(hic.getMatrix(), matrix)
    nt.assert_equal(hic.orig_bin_ids, [])

    # test general use
    # first get some masked bins
    masking_ids = [0, 1]
    hic.maskBins(masking_ids)

    new_matrix = np.array([[0, 0, 2], [0, 0, 1], [0, 0, 0]])

    nt.assert_equal(hic.getMatrix(), new_matrix)
    nt.assert_equal(sorted(hic.orig_bin_ids), sorted([0, 1, 2, 3, 4]))

    # and now restore masked bins
    hic.restoreMaskedBins()

    result_matrix = np.array([[np.nan, np.nan, np.nan, np.nan, np.nan],
                              [np.nan, np.nan, np.nan, np.nan, np.nan],
                              [np.nan, np.nan, 0, 0, 2],
                              [np.nan, np.nan, 0, 0, 1],
                              [np.nan, np.nan, 0, 0, 0]])

    nt.assert_equal(hic.getMatrix(), result_matrix)
    nt.assert_equal(hic.orig_bin_ids, [])

    row, col = np.triu_indices(5)
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('a', 30, 40, 1), ('b', 40, 50, 1)]
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    matrix = np.array([[0, 10, 5, 3, 0], [0, 0, 15, 5, 1], [0, 0, 0, 7, 3],
                       [0, 0, 0, 0, 1], [0, 0, 0, 0, 0]],
                      dtype=np.int32)

    # make the matrix symmetric:
    hic.matrix = csr_matrix(matrix + matrix.T)
    hic.setMatrix(csr_matrix(matrix + matrix.T), cut_intervals)

    # add some masked bins
    hic.maskBins([3])

    matrix = hic.matrix.todense()
    test_matrix = np.array(
        [[0, 10, 5, 0], [10, 0, 15, 1], [5, 15, 0, 3], [0, 1, 3, 0]],
        dtype=np.int32)

    nt.assert_equal(matrix, test_matrix)

    cut_int = hic.cut_intervals
    test_cut_int = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                    ('b', 40, 50, 1)]

    nt.assert_equal(cut_int, test_cut_int)

    hic.restoreMaskedBins()

    dense = hic.matrix.todense()
    test_dense = np.array([[0., 10., 5., 0., 0.], [10., 0., 15., 0., 1.],
                           [5., 15., 0., 0., 3.], [0., 0., 0., 0., 0.],
                           [0., 1., 3., 0., 0.]])

    nt.assert_equal(dense, test_dense)

    cut_int = hic.cut_intervals
    test_cut_int = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                    ('a', 30, 40, 1), ('b', 40, 50, 1)]

    nt.assert_equal(cut_int, test_cut_int)
Example #6
    def set_properties_defaults(self):
        super(HiCMatrixTrack, self).set_properties_defaults()
        region = None
        if self.properties['region'] is not None:
            if self.properties['region'][2] == 1e15:
                region = [str(self.properties['region'][0])]
            elif len(self.properties['region']) == 3:
                start = int(
                    self.properties['region'][1]) - self.properties['depth']
                if start < 0:
                    start = 0
                end = int(
                    self.properties['region'][2]) + self.properties['depth']

                region = [
                    str(self.properties['region'][0]) + ':' + str(start) +
                    '-' + str(end)
                ]
        # try to open the region extended by 'depth' to avoid a triangle effect in the plot;
        # if that fails, open it with the given end position.
        try:
            self.hic_ma = HiCMatrix.hiCMatrix(self.properties['file'],
                                              pChrnameList=region)
        except Exception:
            region = [
                str(self.properties['region'][0]) + ':' + str(start) + '-' +
                str(self.properties['region'][2])
            ]
            self.hic_ma = HiCMatrix.hiCMatrix(self.properties['file'],
                                              pChrnameList=region)

        if len(self.hic_ma.matrix.data) == 0:
            raise Exception("Matrix {} is empty".format(
                self.properties['file']))
        if not self.properties['show_masked_bins']:
            self.hic_ma.maskBins(self.hic_ma.nan_bins)

        # check that the matrix can be log transformed
        if self.properties['transform'] != 'no':
            if self.properties['transform'] == 'log1p':
                if self.hic_ma.matrix.data.min() + 1 <= 0:
                    raise Exception(
                        "\n*ERROR*\nMatrix contains values below -1.\n"
                        "log1p transformation cannot be applied to\n"
                        "values in matrix: {}".format(self.properties['file']))

            elif self.properties['transform'] in ['-log', 'log']:
                if self.hic_ma.matrix.data.min() < 0:
                    # Values that are not filled or equal to zero will be masked
                    # and replaced by the minimum value greater than 0.
                    raise Exception(
                        "\n*ERROR*\nMatrix contains negative values.\n"
                        "log transformation cannot be applied to\n"
                        "values in matrix: {}".format(self.properties['file']))

        new_intervals = hicmatrix.utilities.enlarge_bins(
            self.hic_ma.cut_intervals)
        self.hic_ma.interval_trees, self.hic_ma.chrBinBoundaries = \
            self.hic_ma.intervalListToIntervalTree(new_intervals)

        self.hic_ma.cut_intervals = new_intervals
        binsize = self.hic_ma.getBinSize()
        max_depth_in_bins = int(self.properties['depth'] / binsize)

        # work only with the upper triangle of the matrix
        # and remove all pixels that are beyond
        # 2 * max_depth_in_bins, which are not required
        # (this is done by subtracting a second sparse matrix
        # that contains only the part of the upper triangle to be removed).
        limit = 2 * max_depth_in_bins
        self.hic_ma.matrix = scipy.sparse.triu(self.hic_ma.matrix, k=0, format='csr') - \
            scipy.sparse.triu(self.hic_ma.matrix, k=limit, format='csr')
        self.hic_ma.matrix.eliminate_zeros()

        # fill the main diagonal, otherwise the plot does not look good.
        # The main diagonal is filled with an array containing the
        # max value found in the matrix
        if sum(self.hic_ma.matrix.diagonal()) == 0:
            self.log.info(
                "Filling main diagonal with max value because it is empty and looks bad...\n"
            )
            max_value = self.hic_ma.matrix.data.max()
            main_diagonal = scipy.sparse.dia_matrix(
                ([max_value] * self.hic_ma.matrix.shape[0], [0]),
                shape=self.hic_ma.matrix.shape)
            self.hic_ma.matrix = self.hic_ma.matrix + main_diagonal

        self.norm = None

        self.process_color('colormap',
                           colormap_possible=True,
                           colormap_only=True,
                           default_value_is_colormap=True)

        self.cmap = cm.get_cmap(self.properties['colormap'])
        self.cmap.set_bad('black')
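
# A minimal illustration of the band extraction above: keep the upper triangle
# only up to 'limit' off-diagonals by subtracting a second triangular matrix.
# Uses a small made-up matrix instead of a Hi-C file.
import numpy as np
import scipy.sparse

dense = np.arange(1, 26, dtype=float).reshape(5, 5)
sparse_matrix = scipy.sparse.csr_matrix(dense)
limit = 2  # stands in for 2 * max_depth_in_bins
band = scipy.sparse.triu(sparse_matrix, k=0, format='csr') - \
    scipy.sparse.triu(sparse_matrix, k=limit, format='csr')
band.eliminate_zeros()
# band now holds only the main diagonal and the first off-diagonal
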
Example #7
def main(args=None):
    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match the number of output matrices!'
            )
            exit(1)
    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:

                for resolution in args.resolutions:
                    out_name = args.outFileName[i].split('.')
                    out_name[-2] = out_name[-2] + '_' + str(resolution)
                    out_name = '.'.join(out_name)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']:
        format_was_h5 = False
        if args.inputFormat == 'h5':
            format_was_h5 = True
        applyCorrection = True
        if args.store_applied_correction:
            applyCorrection = False
        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
            else:
                correction_operator = None

                if args.correction_division:
                    correction_operator = '/'

                chromosomes_to_load = None
                if args.chromosome:
                    chromosomes_to_load = [args.chromosome]

                applyCorrectionCoolerLoad = True
                if args.load_raw_values:
                    applyCorrectionCoolerLoad = False
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)

            _matrix, cut_intervals, nan_bins, \
                distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                if args.outputFormat in ['homer', 'ginteractions']:
                    # make it an upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5)

                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)
            elif args.outputFormat in ['mcool']:

                log.debug('outformat is mcool')
                if args.resolutions and len(args.matrices) > 1:
                    log.error(
                        'Please provide either a single matrix with the resolutions to create, or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed; they are not valid for any newly created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)

                    bin_size = hic_matrix.getBinSize()

                    for j, resolution in enumerate(args.resolutions):
                        hic_matrix_res = deepcopy(hic_matrix)

                        _mergeFactor = int(resolution) // bin_size

                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            merged_matrix = hic_matrix_res
                        append = False
                        if j > 0:
                            append = True
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=append,
                            pFileWasH5=format_was_h5)

                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' +
                            str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)

                else:
                    append = False
                    if i > 0:
                        append = True
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        pAppend=append,
                        pFileWasH5=format_was_h5)

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
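
# A short sketch of the symmetrization applied above for 'homer'/'ginteractions'
# output: keep the upper triangle and mirror it so an asymmetric input does not
# end up half empty. The 3 x 3 matrix is made up.
import numpy as np
from scipy.sparse import csr_matrix, triu

asymmetric = csr_matrix(np.array([[1., 2., 0.],
                                  [9., 3., 4.],
                                  [0., 0., 5.]]))
upper = triu(asymmetric, format='csr')
symmetric = upper.maximum(upper.T)
# symmetric.toarray() ->
# [[1., 2., 0.],
#  [2., 3., 4.],
#  [0., 4., 5.]]
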
Example #8
def computeInterIntraTADs(pMatrix, pDomainList, pCoolOrH5, pThreadId, pQueue):
    try:

        inter_left_sum_list = []
        inter_right_sum_list = []
        inter_left_densit_list = []
        inter_right_density_list = []
        inter_left_number_of_contacts_list = []
        inter_right_number_of_contacts_list = []
        inter_left_number_of_contacts_nnz_list = []
        inter_right_number_of_contacts_nzz_list = []

        intra_sum_list = []
        intra_number_of_contacts_list = []
        intra_number_of_contacts_nnz_list = []
        intra_density_list = []
        inter_left_intra_ratio_list = []
        inter_right_intra_ratio_list = []
        inter_left_inter_right_intra_ratio_list = []

        rows = []

        chromosome_list = pDomainList
        for i, row in enumerate(chromosome_list):

            if pThreadId is None:
                log.debug('first thread')
                if i == len(chromosome_list) - 1:
                    continue
            elif pThreadId is True:
                log.debug('middle thread')

                if i == 0 or i == len(chromosome_list) - 1:
                    log.debug('i: {}'.format(i))
                    log.debug('len(chromosome_list): {}'.format(
                        len(chromosome_list)))

                    continue
            elif pThreadId is False:
                log.debug('last thread')

                if i == 0:
                    continue

            if i - 1 >= 0:
                chromosom = chromosome_list[i - 1][0]
                start = chromosome_list[i - 1][1]
            else:
                chromosom = chromosome_list[i][0]
                start = chromosome_list[i][1]
            if i + 1 < len(chromosome_list):
                end = chromosome_list[i + 1][2]
            else:
                end = chromosome_list[i][2]
            # midpos = row[1] + ((row[2] - row[1]) / 2)

            if pCoolOrH5:

                # get intra-TAD data
                hic_matrix = hm.hiCMatrix(pMatrixFile=pMatrix,
                                          pChrnameList=[
                                              str(row[0]) + ':' + str(row[1]) +
                                              '-' + str(row[2])
                                          ])
                matrix = hic_matrix.matrix

                hic_matrix_inter_tad = hm.hiCMatrix(
                    pMatrixFile=pMatrix,
                    pChrnameList=[
                        str(chromosom) + ':' + str(start) + '-' + str(end)
                    ])

                matrix_inter_tad = hic_matrix_inter_tad.matrix

            else:
                # in the h5 case, pMatrix is already a HiCMatrix object
                hic_matrix = pMatrix
                hic_matrix_inter_tad = pMatrix
                indices = hic_matrix.getRegionBinRange(str(row[0]), row[1],
                                                       row[2])

                matrix = hic_matrix.matrix[indices[0]:indices[1],
                                           indices[0]:indices[1]]
                matrix_inter_tad = pMatrix.matrix

            # matrix = matrix.flatten()

            # index of the boundary between the left neighboring TAD and the current TAD
            left_boundary_index = hic_matrix_inter_tad.getRegionBinRange(
                str(chromosom), row[1], row[1])[0]
            if pCoolOrH5:
                outer_left_boundary_index = 0

                outer_right_boundary_index = -1

            else:
                outer_left_boundary_index = hic_matrix_inter_tad.getRegionBinRange(
                    str(chromosom), start, end)[0]

                outer_right_boundary_index = hic_matrix_inter_tad.getRegionBinRange(
                    str(chromosom), start, end)[1]

            if i + 1 < len(chromosome_list) and not pCoolOrH5:
                # index of the boundary between the current TAD and its right neighbor
                right_boundary_index = hic_matrix_inter_tad.getRegionBinRange(
                    str(chromosom), row[2], row[2])[0]
            elif i + 1 < len(chromosome_list):
                right_boundary_index = hic_matrix_inter_tad.getRegionBinRange(
                    str(chromosom), row[2], row[2])[0]

            if i - 1 >= 0 and i + 1 < len(chromosome_list):
                intertad_left = matrix_inter_tad[
                    outer_left_boundary_index:left_boundary_index,
                    left_boundary_index:right_boundary_index]
                intertad_right = matrix_inter_tad[
                    left_boundary_index:right_boundary_index,
                    right_boundary_index:outer_right_boundary_index]

            elif i - 1 < 0 and i + 1 < len(chromosome_list):
                intertad_right = matrix_inter_tad[
                    left_boundary_index:right_boundary_index,
                    right_boundary_index:outer_right_boundary_index]

            elif i - 1 > 0 and i + 1 >= len(chromosome_list):
                intertad_left = matrix_inter_tad[
                    outer_left_boundary_index:left_boundary_index,
                    left_boundary_index:right_boundary_index]

            inter_left_sum = 0
            inter_right_sum = 0
            inter_left_density = 0
            inter_right_density = 0
            inter_left_number_of_contacts = 0
            inter_right_number_of_contacts = 0
            inter_left_number_of_contacts_nnz = 0
            inter_right_number_of_contacts_nzz = 0

            intra_sum = matrix.sum()
            intra_number_of_contacts = matrix.shape[0] * matrix.shape[1]
            intra_number_of_contacts_nnz = matrix.nnz
            intra_density = intra_number_of_contacts_nnz / intra_number_of_contacts
            # both inter regions, left and right, are available
            if i - 1 >= 0 and i + 1 < len(chromosome_list):
                # intertad_left = intertad_left.flatten()
                # intertad_right = intertad_right.flatten()
                inter_left_sum = intertad_left.sum()
                inter_right_sum = intertad_right.sum()

                inter_left_number_of_contacts = intertad_left.shape[
                    0] * intertad_left.shape[1]
                inter_right_number_of_contacts = intertad_right.shape[
                    0] * intertad_right.shape[1]
                inter_left_number_of_contacts_nnz = intertad_left.nnz
                inter_right_number_of_contacts_nzz = intertad_right.nnz

                inter_left_density = inter_left_number_of_contacts_nnz / inter_left_number_of_contacts
                inter_right_density = inter_right_number_of_contacts_nzz / inter_right_number_of_contacts
                # statistic_left, significance_level_left = ranksums(intertad_left, intertad_left_control)
                # statistic_right, significance_level_right = ranksums(intertad_right, intertad_right_control)
            elif i - 1 < 0 and i + 1 < len(chromosome_list):
                # inter right is available
                # intertad_right = intertad_right.flatten()
                inter_right_sum = intertad_right.sum()
                inter_right_number_of_contacts = intertad_right.shape[
                    0] * intertad_right.shape[1]
                inter_right_number_of_contacts_nzz = intertad_right.nnz
                inter_right_density = inter_right_number_of_contacts_nzz / inter_right_number_of_contacts

                # statistic_right, significance_level_right = ranksums(intertad_right, intertad_right_control)
            elif i - 1 > 0 and i + 1 >= len(chromosome_list):
                # inter left is available

                # intertad_left = intertad_left.flatten()
                inter_left_sum = intertad_left.sum()
                inter_left_number_of_contacts = intertad_left.shape[
                    0] * intertad_left.shape[1]
                inter_left_number_of_contacts_nnz = intertad_left.nnz
                inter_left_density = inter_left_number_of_contacts_nnz / inter_left_number_of_contacts

                # statistic_left, significance_level_left = ranksums(intertad_left, intertad_left_control)

            inter_left_intra_ratio = inter_left_sum / intra_sum
            inter_right_intra_ratio = inter_right_sum / intra_sum
            inter_left_inter_right_intra_ratio = (inter_left_sum +
                                                  inter_right_sum) / intra_sum

            inter_left_sum_list.append(inter_left_sum)
            inter_right_sum_list.append(inter_right_sum)
            inter_left_densit_list.append(inter_left_density)
            inter_right_density_list.append(inter_right_density)
            inter_left_number_of_contacts_list.append(
                inter_left_number_of_contacts)
            inter_right_number_of_contacts_list.append(
                inter_right_number_of_contacts)
            inter_left_number_of_contacts_nnz_list.append(
                inter_left_number_of_contacts_nnz)
            inter_right_number_of_contacts_nzz_list.append(
                inter_right_number_of_contacts_nzz)

            intra_sum_list.append(intra_sum)
            intra_number_of_contacts_list.append(intra_number_of_contacts)
            intra_number_of_contacts_nnz_list.append(
                intra_number_of_contacts_nnz)
            intra_density_list.append(intra_density)
            inter_left_intra_ratio_list.append(inter_left_intra_ratio)
            inter_right_intra_ratio_list.append(inter_right_intra_ratio)
            inter_left_inter_right_intra_ratio_list.append(
                inter_left_inter_right_intra_ratio)

            rows.append(row)
    except Exception as exp:
        pQueue.put('Fail: ' + str(exp) + traceback.format_exc())
        return
    pQueue.put([
        inter_left_sum_list, inter_right_sum_list, inter_left_densit_list,
        inter_right_density_list, inter_left_number_of_contacts_list,
        inter_right_number_of_contacts_list,
        inter_left_number_of_contacts_nnz_list,
        inter_right_number_of_contacts_nzz_list, intra_sum_list,
        intra_number_of_contacts_list, intra_number_of_contacts_nnz_list,
        intra_density_list, inter_left_intra_ratio_list,
        inter_right_intra_ratio_list, inter_left_inter_right_intra_ratio_list,
        rows
    ])
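
# A toy illustration of the density and inter/intra ratio bookkeeping above,
# using made-up sparse sub-matrices instead of real TAD regions.
import numpy as np
from scipy.sparse import csr_matrix

intra = csr_matrix(np.array([[0., 5., 0.],
                             [5., 0., 2.],
                             [0., 2., 0.]]))
inter_left = csr_matrix(np.array([[1., 0., 0.],
                                  [0., 0., 3.]]))

intra_density = intra.nnz / (intra.shape[0] * intra.shape[1])  # 4 / 9
inter_left_intra_ratio = inter_left.sum() / intra.sum()        # 4.0 / 14.0
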
def test_build_matrix_cooler_multiple():
    outfile = NamedTemporaryFile(suffix='.cool', delete=False)
    outfile.close()
    qc_folder = mkdtemp(prefix="testQC_")
    args = "-s {} {} --outFileName {} -bs 5000 10000 20000 -b /tmp/test.bam --QCfolder {} --threads 4".format(
        sam_R1, sam_R2, outfile.name, qc_folder).split()
    hicBuildMatrix.main(args)

    test_5000 = hm.hiCMatrix(
        ROOT +
        "hicBuildMatrix/multi_small_test_matrix.cool::/resolutions/5000")
    test_10000 = hm.hiCMatrix(
        ROOT +
        "hicBuildMatrix/multi_small_test_matrix.cool::/resolutions/10000")
    test_20000 = hm.hiCMatrix(
        ROOT +
        "hicBuildMatrix/multi_small_test_matrix.cool::/resolutions/20000")

    new_5000 = hm.hiCMatrix(outfile.name + '::/resolutions/5000')
    new_10000 = hm.hiCMatrix(outfile.name + '::/resolutions/10000')
    new_20000 = hm.hiCMatrix(outfile.name + '::/resolutions/20000')

    nt.assert_equal(test_5000.matrix.data, new_5000.matrix.data)
    nt.assert_equal(test_10000.matrix.data, new_10000.matrix.data)
    nt.assert_equal(test_20000.matrix.data, new_20000.matrix.data)

    # nt.assert_equal(test.cut_intervals, new.cut_intervals)
    nt.assert_equal(len(new_5000.cut_intervals), len(test_5000.cut_intervals))
    nt.assert_equal(len(new_10000.cut_intervals),
                    len(test_10000.cut_intervals))
    nt.assert_equal(len(new_20000.cut_intervals),
                    len(test_20000.cut_intervals))

    cut_interval_new_ = []
    cut_interval_test_ = []
    for x in new_5000.cut_intervals:
        cut_interval_new_.append(x[:3])
    for x in test_5000.cut_intervals:
        cut_interval_test_.append(x[:3])

    nt.assert_equal(cut_interval_new_, cut_interval_test_)

    cut_interval_new_ = []
    cut_interval_test_ = []
    for x in new_10000.cut_intervals:
        cut_interval_new_.append(x[:3])
    for x in test_10000.cut_intervals:
        cut_interval_test_.append(x[:3])

    nt.assert_equal(cut_interval_new_, cut_interval_test_)

    cut_interval_new_ = []
    cut_interval_test_ = []
    for x in new_20000.cut_intervals:
        cut_interval_new_.append(x[:3])
    for x in test_20000.cut_intervals:
        cut_interval_test_.append(x[:3])

    nt.assert_equal(cut_interval_new_, cut_interval_test_)
    # print(set(os.listdir(ROOT + "QC/")))
    assert are_files_equal(ROOT + "QC/QC.log", qc_folder + "/QC.log")
    assert set(os.listdir(ROOT + "QC/")) == set(os.listdir(qc_folder))

    os.unlink(outfile.name)
    shutil.rmtree(qc_folder)
Example #10
def adjustMatrix(pArgs):
    if pArgs.chromosomes is not None and pArgs.regions is not None:
        log.error('Please specify either --chromosomes or --regions, not both.')
        exit(1)
    hic_matrix = None
    if pArgs.chromosomes:

        if check_cooler(pArgs.matrix) and len(pArgs.chromosomes) == 1 and pArgs.action == 'keep':
            chromosomes_list = cooler.Cooler(pArgs.matrix).chromnames
            if pArgs.chromosomes[0] in chromosomes_list:
                hic_matrix = hm.hiCMatrix(pArgs.matrix, pChrnameList=pArgs.chromosomes)
            else:
                log.error('Chromosome not available in matrix: {} {}'.format(pArgs.matrix, pArgs.chromosomes[0]))
                exit(1)
        else:
            hic_matrix = hm.hiCMatrix(pArgs.matrix)

        chromosomes_list = list(hic_matrix.chrBinBoundaries)
        chromosomes_list_to_operate_on = []
        for chromosome in pArgs.chromosomes:
            if chromosome in chromosomes_list:
                chromosomes_list_to_operate_on.append(chromosome)
            else:
                log.warning('Chromosome not available in matrix: {} {}'.format(pArgs.matrix, chromosome))
        if len(chromosomes_list_to_operate_on) == 0:
            log.error('No valid chromosome given: {}. Available: {}'.format(pArgs.chromosomes, chromosomes_list))
            exit(1)
        if pArgs.action == 'keep':
            hic_matrix.reorderChromosomes(chromosomes_list_to_operate_on)
        elif pArgs.action == 'remove':
            # chromosomes = list(hic_matrix.chrBinBoundaries)
            for chromosome in chromosomes_list:
                if chromosome in chromosomes_list_to_operate_on:
                    chromosomes_list.remove(chromosome)
            hic_matrix.reorderChromosomes(chromosomes_list)
        elif pArgs.action == 'mask':
            hic_matrix.maskChromosomes(chromosomes_list_to_operate_on)
    elif pArgs.regions:
        hic_matrix = hm.hiCMatrix(pArgs.matrix)
        chromosomes_list = list(hic_matrix.chrBinBoundaries)
        genomic_regions = []
        with open(pArgs.regions, 'r') as file:
            for line in file.readlines():
                _line = line.strip().split('\t')
                if len(_line) < 3:
                    log.warning("An entry with fewer than 3 columns has been found; skipping it.")
                    continue
                if len(_line) >= 3:
                    chrom, start, end = _line[0], int(_line[1]), int(_line[2])
                    if chrom in chromosomes_list:
                        genomic_regions.append((chrom, start, end))
                    else:
                        log.warning('Chromosome not available in matrix, ignoring regions: {} {}'.format(pArgs.matrix, chrom))
        if len(genomic_regions) == 0:
            log.error('No valid chromosome given. Available: {}'.format(chromosomes_list))
            exit(1)
        # log.debug('genomic_regions {}'.format(genomic_regions))
        matrix_indices_regions = []
        for region in genomic_regions:
            _regionBinRange = hic_matrix.getRegionBinRange(region[0], region[1], region[2])
            if _regionBinRange is not None:
                start, end = _regionBinRange
                matrix_indices_regions.extend(list(range(start, end)))

        # log.debug('matrix_indices_regions {}'.format(matrix_indices_regions))
        if pArgs.action == 'keep':
            values_submatrix = matrix_indices_regions
            instances, features = hic_matrix.matrix.nonzero()
            mask = np.isin(instances, values_submatrix)
            mask = np.logical_not(mask)
            hic_matrix.matrix.data[mask] = 0
            hic_matrix.matrix.eliminate_zeros()
        elif pArgs.action == 'mask':
            hic_matrix.maskBins(matrix_indices_regions)

        elif pArgs.action == 'remove':

            full_matrix_range = np.array(range(0, max(hic_matrix.matrix.shape[0], hic_matrix.matrix.shape[1])))
            matrix_indices_regions = np.array(matrix_indices_regions)
            full_matrix_range[matrix_indices_regions] = -1
            mask = full_matrix_range != -1
            full_matrix_range = full_matrix_range[mask]

            hic_matrix.reorderBins(full_matrix_range)
    elif pArgs.maskBadRegions:
        if check_cooler(pArgs.matrix) and pArgs.chromosomes is not None and len(pArgs.chromosomes) == 1 and pArgs.action == 'keep':
            hic_matrix = hm.hiCMatrix(pArgs.matrix, pChrnameList=pArgs.chromosomes)
        else:
            hic_matrix = hm.hiCMatrix(pArgs.matrix)

    else:
        log.info('No data to adjust given. Please specify either the --chromosomes or the --regions parameter.')

    return hic_matrix
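For --regions with action 'keep', adjustMatrix zeroes every stored entry whose row index is outside the selected bins: np.isin marks the rows to keep, the inverted mask zeroes the data array, and eliminate_zeros() drops the explicit zeros. A standalone sketch of that masking step on toy data (note that, like the code above, it masks by row index only):

import numpy as np
from scipy.sparse import csr_matrix

# toy 5x5 contact matrix
matrix = csr_matrix(np.arange(25, dtype=float).reshape(5, 5))
bins_to_keep = [1, 2]   # bin indices covered by the selected regions

rows, _cols = matrix.nonzero()
mask = np.isin(rows, bins_to_keep)       # True where the row index is kept
matrix.data[np.logical_not(mask)] = 0    # zero all other stored entries
matrix.eliminate_zeros()                 # drop the explicit zeros

print(matrix.toarray())                  # only rows 1 and 2 remain non-zero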
Example #11
def main(args=None):
    args = parse_arguments().parse_args(args)
    mpl.rcParams['pdf.fonttype'] = 42

    # read domains file
    domains_df = readDomainBoundaries(args.tadDomains)
    # log.debug('len(domains_df) {}'.format(len(domains_df)))
    domains = domains_df.values.tolist()
    old_chromosome = None

    tads_per_chromosome = []

    for j in range(len(domains)):
        if old_chromosome is None:
            old_chromosome = domains[j][0]
            per_chromosome = []
            per_chromosome.append(domains[j])

        elif old_chromosome == domains[j][0]:
            per_chromosome.append(domains[j])
            continue
        else:
            tads_per_chromosome.append(per_chromosome)
            per_chromosome = []
            per_chromosome.append(domains[j])
            old_chromosome = domains[j][0]
    tads_per_chromosome.append(per_chromosome)

    # read full h5 or only region if cooler
    is_cooler = check_cooler(args.matrix)

    if not is_cooler:
        hic_matrix = hm.hiCMatrix(args.matrix)
    else:
        hic_matrix = args.matrix

    inter_left_sum_list_chromosomes = []
    inter_right_sum_list_chromosomes = []
    inter_left_density_list_chromosomes = []
    inter_right_density_list_chromosomes = []
    inter_left_number_of_contacts_list_chromosomes = []
    inter_right_number_of_contacts_list_chromosomes = []
    inter_left_number_of_contacts_nnz_list_chromosomes = []
    inter_right_number_of_contacts_nzz_list_chromosomes = []

    intra_sum_list_chromosomes = []
    intra_number_of_contacts_list_chromosomes = []
    intra_number_of_contacts_nnz_list_chromosomes = []
    intra_density_list_chromosomes = []
    inter_left_intra_ratio_list_chromosomes = []
    inter_right_intra_ratio_list_chromosomes = []
    inter_left_inter_right_intra_ratio_list_chromosomes = []

    rows_chromosomes = []

    inter_left_sum_list_threads = [[]] * args.threads
    inter_right_sum_list_threads = [[]] * args.threads
    inter_left_density_list_threads = [[]] * args.threads
    inter_right_density_list_threads = [[]] * args.threads
    inter_left_number_of_contacts_list_threads = [[]] * args.threads
    inter_right_number_of_contacts_list_threads = [[]] * args.threads
    inter_left_number_of_contacts_nnz_list_threads = [[]] * args.threads
    inter_right_number_of_contacts_nzz_list_threads = [[]] * args.threads

    intra_sum_list_threads = [[]] * args.threads
    intra_number_of_contacts_list_threads = [[]] * args.threads
    intra_number_of_contacts_nnz_list_threads = [[]] * args.threads
    intra_density_list_threads = [[]] * args.threads
    inter_left_intra_ratio_list_threads = [[]] * args.threads
    inter_right_intra_ratio_list_threads = [[]] * args.threads
    inter_left_inter_right_intra_ratio_list_threads = [[]] * args.threads

    rows_threads = [[]] * args.threads

    threads_save = deepcopy(args.threads)
    for chromosome in tads_per_chromosome:
        # log.debug('tads_per_chromosome {}'.format(chromosome))
        domainsPerThread = len(chromosome) // args.threads
        if domainsPerThread == 0 and len(chromosome) > 0:
            domainsPerThread = 1
            args.threads = 1
        elif domainsPerThread > 0:
            args.threads = threads_save

        all_data_collected = False
        queue = [None] * args.threads
        process = [None] * args.threads
        thread_done = [False] * args.threads
        # None --> first thread, process first element in list, ignore last one
        # True --> middle thread: ignore first and last element in tad processing
        # False --> last thread: ignore first element, process last one
        thread_id = None
        for i in range(args.threads):

            if args.threads == 1:
                domainListThread = chromosome

            elif i == 0:
                domainListThread = chromosome[i * domainsPerThread:(
                    (i + 1) * domainsPerThread) + 1]
                thread_id = None
            elif i < args.threads - 1:
                domainListThread = chromosome[(i * domainsPerThread) -
                                              1:((i + 1) * domainsPerThread) +
                                              1]
                thread_id = True

            else:
                domainListThread = chromosome[(i * domainsPerThread) - 1:]
                thread_id = False

            if args.threads == 1:
                thread_id = ''

            # log.debug('len(domainListThread) {}'.format(len(domainListThread)))
            # log.debug('len(thread_id) {}'.format(thread_id))

            queue[i] = Queue()
            process[i] = Process(
                target=computeInterIntraTADs,
                kwargs=dict(
                    pMatrix=hic_matrix,
                    # pMatrixControl=hic_matrix_control,
                    pDomainList=domainListThread,
                    pCoolOrH5=is_cooler,
                    # pPValue=args.pValue,
                    pThreadId=thread_id,
                    pQueue=queue[i]))

            process[i].start()
        fail_flag = False
        fail_message = ''
        while not all_data_collected:
            for i in range(args.threads):

                if queue[i] is not None and not queue[i].empty():
                    queue_data = queue[i].get()
                    if 'Fail:' in queue_data:
                        fail_flag = True
                        fail_message = queue_data
                    else:
                        inter_left_sum_list_threads[i], \
                            inter_right_sum_list_threads[i], \
                            inter_left_density_list_threads[i], \
                            inter_right_density_list_threads[i], \
                            inter_left_number_of_contacts_list_threads[i], \
                            inter_right_number_of_contacts_list_threads[i], \
                            inter_left_number_of_contacts_nnz_list_threads[i], \
                            inter_right_number_of_contacts_nzz_list_threads[i], \
                            intra_sum_list_threads[i], \
                            intra_number_of_contacts_list_threads[i], \
                            intra_number_of_contacts_nnz_list_threads[i], \
                            intra_density_list_threads[i], \
                            inter_left_intra_ratio_list_threads[i], \
                            inter_right_intra_ratio_list_threads[i], \
                            inter_left_inter_right_intra_ratio_list_threads[i], \
                            rows_threads[i] = queue_data

                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True
                # elif queue[i] is None and

            all_data_collected = True
            for thread in thread_done:
                if not thread:
                    all_data_collected = False
            time.sleep(1)

        if fail_flag:
            log.error(fail_message[6:])
            exit(1)

        inter_left_sum_list_chromosomes.append([
            item for sublist in inter_left_sum_list_threads for item in sublist
        ])
        inter_right_sum_list_chromosomes.append([
            item for sublist in inter_right_sum_list_threads
            for item in sublist
        ])
        inter_left_density_list_chromosomes.append([
            item for sublist in inter_left_density_list_threads
            for item in sublist
        ])
        inter_right_density_list_chromosomes.append([
            item for sublist in inter_right_density_list_threads
            for item in sublist
        ])
        inter_left_number_of_contacts_list_chromosomes.append([
            item for sublist in inter_left_number_of_contacts_list_threads
            for item in sublist
        ])
        inter_right_number_of_contacts_list_chromosomes.append([
            item for sublist in inter_right_number_of_contacts_list_threads
            for item in sublist
        ])
        inter_left_number_of_contacts_nnz_list_chromosomes.append([
            item for sublist in inter_left_number_of_contacts_nnz_list_threads
            for item in sublist
        ])
        inter_right_number_of_contacts_nzz_list_chromosomes.append([
            item for sublist in inter_right_number_of_contacts_nzz_list_threads
            for item in sublist
        ])

        intra_sum_list_chromosomes.append(
            [item for sublist in intra_sum_list_threads for item in sublist])
        intra_number_of_contacts_list_chromosomes.append([
            item for sublist in intra_number_of_contacts_list_threads
            for item in sublist
        ])
        intra_number_of_contacts_nnz_list_chromosomes.append([
            item for sublist in intra_number_of_contacts_nnz_list_threads
            for item in sublist
        ])
        intra_density_list_chromosomes.append([
            item for sublist in intra_density_list_threads for item in sublist
        ])
        inter_left_intra_ratio_list_chromosomes.append([
            item for sublist in inter_left_intra_ratio_list_threads
            for item in sublist
        ])
        inter_right_intra_ratio_list_chromosomes.append([
            item for sublist in inter_right_intra_ratio_list_threads
            for item in sublist
        ])
        inter_left_inter_right_intra_ratio_list_chromosomes.append([
            item for sublist in inter_left_inter_right_intra_ratio_list_threads
            for item in sublist
        ])

        rows_chromosomes.append(
            [item for sublist in rows_threads for item in sublist])

    inter_left_sum_list = [
        item for sublist in inter_left_sum_list_chromosomes for item in sublist
    ]
    inter_right_sum_list = [
        item for sublist in inter_right_sum_list_chromosomes
        for item in sublist
    ]
    inter_left_density_list = [
        item for sublist in inter_left_density_list_chromosomes
        for item in sublist
    ]
    inter_right_density_list = [
        item for sublist in inter_right_density_list_chromosomes
        for item in sublist
    ]
    inter_left_number_of_contacts_list = [
        item for sublist in inter_left_number_of_contacts_list_chromosomes
        for item in sublist
    ]
    inter_right_number_of_contacts_list = [
        item for sublist in inter_right_number_of_contacts_list_chromosomes
        for item in sublist
    ]
    inter_left_number_of_contacts_nnz_list = [
        item for sublist in inter_left_number_of_contacts_nnz_list_chromosomes
        for item in sublist
    ]
    inter_right_number_of_contacts_nzz_list = [
        item for sublist in inter_right_number_of_contacts_nzz_list_chromosomes
        for item in sublist
    ]

    intra_sum_list = [
        item for sublist in intra_sum_list_chromosomes for item in sublist
    ]
    intra_number_of_contacts_list = [
        item for sublist in intra_number_of_contacts_list_chromosomes
        for item in sublist
    ]
    intra_number_of_contacts_nnz_list = [
        item for sublist in intra_number_of_contacts_nnz_list_chromosomes
        for item in sublist
    ]
    intra_density_list = [
        item for sublist in intra_density_list_chromosomes for item in sublist
    ]
    inter_left_intra_ratio_list = [
        item for sublist in inter_left_intra_ratio_list_chromosomes
        for item in sublist
    ]
    inter_right_intra_ratio_list = [
        item for sublist in inter_right_intra_ratio_list_chromosomes
        for item in sublist
    ]
    inter_left_inter_right_intra_ratio_list = [
        item for sublist in inter_left_inter_right_intra_ratio_list_chromosomes
        for item in sublist
    ]

    rows = [item for sublist in rows_chromosomes for item in sublist]

    with open(args.outFileName, 'w') as file:
        header = '# Created with HiCExplorer\'s hicInterIntraTAD version ' + __version__ + '\n'
        header += '# Chromosome\tstart\tend\tname\tscore\tstrand\tinter_left_sum\tinter_right_sum\tinter_left_density\tinter_right_density\tinter_left_number_of_contacts\tinter_right_number_of_contacts\t'  \
            'inter_left_number_of_contacts_nnz\tinter_right_number_of_contacts_nnz\tintra_sum\tintra_number_of_contacts\tintra_number_of_contacts_nnz\tintra_density\tinter_left_intra_ratio\tinter_right_intra_ratio\tinter_left_inter_right_intra_ratio\n'
        file.write(header)
        for i, row in enumerate(rows):
            row_list = list(map(str, row))

            file.write('\t'.join(row_list))

            file.write('\t{}'.format(inter_left_sum_list[i]))
            file.write('\t{}'.format(inter_right_sum_list[i]))
            file.write('\t{}'.format(inter_left_density_list[i]))
            file.write('\t{}'.format(inter_right_density_list[i]))
            file.write('\t{}'.format(inter_left_number_of_contacts_list[i]))
            file.write('\t{}'.format(inter_right_number_of_contacts_list[i]))
            file.write('\t{}'.format(
                inter_left_number_of_contacts_nnz_list[i]))
            file.write('\t{}'.format(
                inter_right_number_of_contacts_nzz_list[i]))
            file.write('\t{}'.format(intra_sum_list[i]))
            file.write('\t{}'.format(intra_number_of_contacts_list[i]))
            file.write('\t{}'.format(intra_number_of_contacts_nnz_list[i]))
            file.write('\t{}'.format(intra_density_list[i]))
            file.write('\t{}'.format(inter_left_intra_ratio_list[i]))
            file.write('\t{}'.format(inter_right_intra_ratio_list[i]))
            file.write('\t{}'.format(
                inter_left_inter_right_intra_ratio_list[i]))

            file.write('\n')

    plt.scatter(inter_left_intra_ratio_list,
                inter_right_intra_ratio_list,
                s=20,
                alpha=0.7)
    plt.xlabel('Inter-left/intra TAD contact ratio', fontsize=args.fontsize)
    plt.ylabel('Inter-right/intra TAD contact ratio', fontsize=args.fontsize)
    plt.tight_layout()
    plt.savefig(args.outFileNameRatioPlot, dpi=args.dpi)
    plt.close()
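The per-thread slicing above hands every worker one extra TAD on each side of its chunk so that inter-TAD contacts at chunk borders can still be computed, and the pThreadId flag (None / True / False, or '' for a single thread) tells the worker which border TADs to skip so no TAD is counted twice. A standalone sketch of that chunking scheme (hypothetical helper, simplified from the loop above):

def split_domains_with_overlap(domains, threads):
    # split a TAD list into per-thread chunks that overlap by one element
    # at each internal border, mirroring the slicing in main()
    per_thread = len(domains) // threads
    chunks = []
    for i in range(threads):
        if threads == 1:
            chunks.append((domains, ''))                          # single thread
        elif i == 0:
            chunks.append((domains[:per_thread + 1], None))       # first thread
        elif i < threads - 1:
            chunks.append((domains[i * per_thread - 1:(i + 1) * per_thread + 1], True))
        else:
            chunks.append((domains[i * per_thread - 1:], False))  # last thread
    return chunks


# usage sketch: 7 TADs on 3 threads -> chunk sizes 3, 4 and 4 with shared borders
print([len(chunk) for chunk, _flag in split_domains_with_overlap(list(range(7)), 3)])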
Example #12
def main(args=None):

    args = parse_arguments().parse_args(args)

    hic_ma = hm.hiCMatrix(pMatrixFile=args.matrix)
    indices_values = []

    with open(args.regions, 'r') as file:
        for line in file.readlines():
            if len(line.strip()) == 0:
                continue
            _line = line.strip().split('\t')
            if len(_line) == 2:
                chrom, start = _line[0], _line[1]

                viewpoint = (chrom, start, start)
            elif len(_line) >= 3:
                chrom, start, end = _line[0], _line[1], _line[2]
                if args.considerStrandDirection and len(_line) < 6:
                    log.error(
                        'Strand orientation should be considered, but the file does not contain a 6th BED column with this information. Exiting!'
                    )
                    exit(1)

                viewpoint = (chrom, start, end)
            if args.range:
                start_range_genomic, end_range_genomic, start_out, end_out = calculateViewpointRange(
                    hic_ma, viewpoint, args.range,
                    args.coordinatesToBinMapping)
                start_bin, end_bin = getBinIndices(
                    hic_ma, (chrom, start_range_genomic, end_range_genomic))
            else:
                start_bin, end_bin, start_out, end_out = calculateViewpointRangeBins(
                    hic_ma, viewpoint, args.rangeInBins,
                    args.coordinatesToBinMapping)
            if args.considerStrandDirection:
                indices_values.append(
                    [start_bin, end_bin, start_out, end_out, _line[5]])

            else:
                indices_values.append(
                    [start_bin, end_bin, start_out, end_out, None])

    if args.range:
        dimensions_new_matrix = (args.range[0] // hic_ma.getBinSize()) + (
            args.range[1] // hic_ma.getBinSize())
    elif args.rangeInBins:
        dimensions_new_matrix = args.rangeInBins[0] + args.rangeInBins[1]

    summed_matrix = lil_matrix((dimensions_new_matrix, dimensions_new_matrix),
                               dtype=np.float32)
    count_matrix = np.zeros(shape=(dimensions_new_matrix,
                                   dimensions_new_matrix))

    # max_length = hic_ma.matrix.shape[1]
    for start, end, start_out, end_out, orientation in indices_values:
        _start = 0
        _end = summed_matrix.shape[1]
        # if start < 0:
        #     _start = np.absolute(start)
        #     start = 0
        # if end >= max_length:
        #     _end = end
        #     end = max_length
        orig_matrix_length = end - start
        if start_out:
            _start = _end - orig_matrix_length
        if end_out:
            _end = start + orig_matrix_length
        submatrix = hic_ma.matrix[start:end, start:end]
        if summed_matrix.shape != submatrix.shape:
            log.warning('Shape of a submatrix does not match. It is ignored.')
            log.warning('Region: {}'.format(hic_ma.getBinPos(start)))
            continue
        count_matrix[_start:_end, _start:_end] += 1

        if orientation is None or orientation == '+':
            summed_matrix[_start:_end, _start:_end] += hic_ma.matrix[start:end,
                                                                     start:end]
        elif orientation == '-':

            summed_matrix[_start:_end,
                          _start:_end] += hic_ma.matrix[start:end, start:end].T
    summed_matrix /= count_matrix
    summed_matrix = np.array(summed_matrix)
    data = summed_matrix[np.nonzero(summed_matrix)]
    row = np.nonzero(summed_matrix)[0]
    col = np.nonzero(summed_matrix)[1]
    summed_matrix = csr_matrix(
        (data, (row, col)),
        shape=(dimensions_new_matrix, dimensions_new_matrix))
    save_npz(args.outFileName, summed_matrix)
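The loop above accumulates a fixed-size window around every viewpoint into summed_matrix, keeps a parallel count_matrix of how many windows contributed to each cell, and divides sum by count at the end; '-'-strand viewpoints are added transposed so all windows share one orientation. A toy NumPy sketch of the sum/count averaging (simplified: windows are placed at the origin instead of the _start/_end alignment used above):

import numpy as np

window = 4                                    # window size in bins
summed = np.zeros((window, window))
counts = np.zeros((window, window))

submatrices = [np.ones((window, window)),     # a full window
               np.full((2, 2), 3.0)]          # a smaller window, e.g. near a chromosome end

for sub in submatrices:
    n = sub.shape[0]
    summed[:n, :n] += sub                     # accumulate the (possibly smaller) window
    counts[:n, :n] += 1                       # remember how many windows hit each cell

averaged = summed / np.maximum(counts, 1)     # avoid dividing by zero for untouched cells
print(averaged)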
Example #13
def mergeLoops(pDataFrame, pLowestResolution, pTupleX, pTupleY):
    hic = hm.hiCMatrix()
    target_regions_intervaltree_x = hic.intervalListToIntervalTree(pTupleX)[0]
    target_regions_intervaltree_y = hic.intervalListToIntervalTree(pTupleY)[0]

    for i, loop in enumerate(pDataFrame.values):
        # neighborhood factor to extend the search range; this lets smaller
        # bins be treated as if they were bins of the lowest resolution
        neighborhood_factor_x = int(pLowestResolution) - abs(
            int(loop[2]) - int(loop[1]))
        neighborhood_factor_y = int(pLowestResolution) - abs(
            int(loop[5]) - int(loop[4]))

        x_interval = set()
        y_interval = set()
        if loop[0] in target_regions_intervaltree_x:
            x_interval = target_regions_intervaltree_x[loop[0]].overlap(
                loop[1] - neighborhood_factor_x - 1,
                loop[2] + neighborhood_factor_x + 1)
        if loop[3] in target_regions_intervaltree_y:
            y_interval = target_regions_intervaltree_y[loop[3]].overlap(
                loop[4] - neighborhood_factor_y - 1,
                loop[5] + neighborhood_factor_y + 1)

        if len(x_interval) <= 1 or len(y_interval) <= 1:
            continue

        dict_of_interest_x = {}
        list_of_interest = []
        for data in x_interval:
            dict_of_interest_x[data[2]] = [data[0], data[1]]
        for data in y_interval:
            if data[2] in dict_of_interest_x:
                list_of_interest.append(data)

        max_index = 0
        max_distance = 0
        all_id_list = []
        for data in list_of_interest:
            if abs(data[0] - data[1]) > max_distance:
                max_distance = abs(data[0] - data[1])
                max_index = data[2]
            all_id_list.append(data[2])
        for data in x_interval:
            if data[2] == max_index:
                continue
            if data[2] not in all_id_list:
                continue
            target_regions_intervaltree_x[loop[0]].remove(data)

        for data in y_interval:
            if data[2] == max_index:
                continue
            if data[2] not in all_id_list:
                continue
            target_regions_intervaltree_y[loop[0]].remove(data)

    result_list_index = []
    dict_x = {}
    dict_y = {}
    for chromosome_x, chromosome_y in zip(target_regions_intervaltree_x,
                                          target_regions_intervaltree_y):
        target_regions_intervaltree_x[chromosome_x] = sorted(
            target_regions_intervaltree_x[chromosome_x])
        target_regions_intervaltree_y[chromosome_y] = sorted(
            target_regions_intervaltree_y[chromosome_y])

        for x in target_regions_intervaltree_x[chromosome_x]:
            dict_x[x[2]] = (x[0], x[1])
        for y in target_regions_intervaltree_y[chromosome_y]:
            dict_y[y[2]] = (y[0], y[1])
        for x in dict_x:
            if x in dict_y:
                result_list_index.append(x)

        dict_x = None
        dict_x = {}
        dict_y = None
        dict_y = {}
    return result_list_index
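Among loop calls whose x and y anchors overlap within the lowest-resolution neighborhood, mergeLoops keeps only the call with the widest anchor and removes the others from both interval trees. A minimal sketch of that 'keep the widest overlapping interval' step with the intervaltree package (toy data, assuming intervaltree >= 3 for .overlap(); the tool itself builds its trees via hiCMatrix.intervalListToIntervalTree):

from intervaltree import IntervalTree

# toy anchors: (start, end, loop_id)
tree = IntervalTree()
for start, end, loop_id in [(100, 200, 0), (150, 260, 1), (500, 600, 2)]:
    tree.addi(start, end, loop_id)

hits = tree.overlap(100, 250)                # all anchors overlapping the query window
if len(hits) > 1:
    widest = max(hits, key=lambda iv: iv.end - iv.begin)
    for iv in hits:
        if iv is not widest:
            tree.remove(iv)                  # drop the narrower duplicates

print(sorted((iv.begin, iv.end, iv.data) for iv in tree))
# -> [(150, 260, 1), (500, 600, 2)]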
Example #14
def main(args=None):

    args = parse_arguments().parse_args(args)

    if not (args.outFileName.endswith('.h5')
            or args.outFileName.endswith('.cool')):
        log.error('Output filetype not known.')
        log.error('It is: {}'.format(args.outFileName))
        log.error('Accepted is .h5 or .cool')
        exit(1)

    hic_ma = hm.hiCMatrix(pMatrixFile=args.matrix)
    log.info("hic_ma.matrix: {}".format(hic_ma.matrix))
    if args.chromosomes:
        hic_ma.keepOnlyTheseChr(args.chromosomes)

    length_chromosome = 0
    chromosome_count = len(hic_ma.getChrNames())
    for chrname in hic_ma.getChrNames():
        chr_range = hic_ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]
    trasf_matrix = lil_matrix(hic_ma.matrix.shape)

    if args.method == 'norm':
        trasf_matrix = lil_matrix(hic_ma.matrix.shape)
        # trasf_matrix_pearson = lil_matrix(hic_ma.matrix.shape)
        # trasf_matrix_corr = lil_matrix(hic_ma.matrix.shape)

        for chrname in hic_ma.getChrNames():
            chr_range = hic_ma.getChrBinRange(chrname)
            submatrix = hic_ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]

            submatrix.astype(float)
            submatrix = _obs_exp_norm(submatrix, length_chromosome,
                                      chromosome_count)

            submatrix = __pearson(submatrix)
            trasf_matrix[chr_range[0]:chr_range[1],
                         chr_range[0]:chr_range[1]] = lil_matrix(submatrix)

        # hic_ma.setMatrix(trasf_matrix.tocsr(), cut_intervals=hic_ma.cut_intervals)
        # hic_ma.save('obs_norm_pearson.'+ args.outFileName, pSymmetric=False, pApplyCorrection=False)

    elif args.method == 'obs_exp':
        for chrname in hic_ma.getChrNames():
            chr_range = hic_ma.getChrBinRange(chrname)
            submatrix = hic_ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]
            submatrix.astype(float)
            trasf_matrix[chr_range[0]:chr_range[1],
                         chr_range[0]:chr_range[1]] = lil_matrix(
                             __obs_exp(submatrix, length_chromosome,
                                       chromosome_count))

    elif args.method == 'pearson':
        for chrname in hic_ma.getChrNames():
            chr_range = hic_ma.getChrBinRange(chrname)
            submatrix = hic_ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]
            log.debug("shape: {}".format(submatrix.shape))

            submatrix.astype(float)
            log.debug("shape: {}".format(submatrix.shape))

            trasf_matrix[chr_range[0]:chr_range[1],
                         chr_range[0]:chr_range[1]] = lil_matrix(
                             __pearson(submatrix.todense()))

    elif args.method == 'covariance':
        for chrname in hic_ma.getChrNames():
            chr_range = hic_ma.getChrBinRange(chrname)
            submatrix = hic_ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]
            log.debug("shape: {}".format(submatrix.shape))

            submatrix.astype(float)
            log.debug("shape: {}".format(submatrix.shape))

            corrmatrix = np.cov(submatrix.todense())
            trasf_matrix[chr_range[0]:chr_range[1],
                         chr_range[0]:chr_range[1]] = lil_matrix(corrmatrix)

    elif args.method == 'all':
        trasf_matrix_obs_exp = lil_matrix(hic_ma.matrix.shape)
        trasf_matrix_pearson = lil_matrix(hic_ma.matrix.shape)
        trasf_matrix_corr = lil_matrix(hic_ma.matrix.shape)

        for chrname in hic_ma.getChrNames():
            chr_range = hic_ma.getChrBinRange(chrname)
            submatrix = hic_ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]

            submatrix.astype(float)
            submatrix = __obs_exp(submatrix, length_chromosome,
                                  chromosome_count)

            trasf_matrix_obs_exp[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = lil_matrix(
                                     submatrix)
            submatrix = __pearson(submatrix)

            trasf_matrix_pearson[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = lil_matrix(
                                     submatrix)
            corrmatrix = np.cov(submatrix)
            trasf_matrix_corr[chr_range[0]:chr_range[1],
                              chr_range[0]:chr_range[1]] = lil_matrix(
                                  corrmatrix)

        hic_ma.setMatrix(trasf_matrix_obs_exp.tocsr(),
                         cut_intervals=hic_ma.cut_intervals)

        basename_outFileName = basename(args.outFileName)
        basename_obs_exp = "obs_exp_" + basename_outFileName
        basename_pearson = "pearson_" + basename_outFileName
        basename_covariance = "covariance_" + basename_outFileName
        path = dirname(args.outFileName)
        if path != '':
            path += '/'

        hic_ma.save(path + basename_obs_exp,
                    pSymmetric=False,
                    pApplyCorrection=False)

        hic_ma.setMatrix(trasf_matrix_pearson.tocsr(),
                         cut_intervals=hic_ma.cut_intervals)
        hic_ma.save(path + basename_pearson,
                    pSymmetric=False,
                    pApplyCorrection=False)

        hic_ma.setMatrix(trasf_matrix_corr.tocsr(),
                         cut_intervals=hic_ma.cut_intervals)
        hic_ma.save(path + basename_covariance,
                    pSymmetric=False,
                    pApplyCorrection=False)

    if not args.method == 'all':
        hic_ma.setMatrix(trasf_matrix.tocsr(),
                         cut_intervals=hic_ma.cut_intervals)
        hic_ma.save(args.outFileName, pSymmetric=False, pApplyCorrection=False)
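Every transform above operates per chromosome on the intra-chromosomal block. The observed/expected variants divide each contact by the expected value for its genomic distance, i.e. by the mean of its diagonal; a minimal dense sketch of that per-diagonal normalisation (a simplification, not HiCExplorer's __obs_exp / _obs_exp_norm):

import numpy as np


def obs_exp_dense(matrix):
    # divide each entry by the mean of its diagonal (the distance-based expected value)
    matrix = np.asarray(matrix, dtype=float)
    result = np.zeros_like(matrix)
    n = matrix.shape[0]
    for distance in range(n):
        diag = np.diagonal(matrix, offset=distance)
        expected = diag.mean()
        if expected > 0:
            idx = np.arange(n - distance)
            result[idx, idx + distance] = diag / expected
            result[idx + distance, idx] = diag / expected   # keep the result symmetric
    return result


print(obs_exp_dense([[4.0, 2.0], [2.0, 1.0]]))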
Example #15
def main(args=None):
    args = parse_arguments().parse_args(args)
    if args.verbose:
        log.setLevel(logging.INFO)

    # args.chromosomes
    if check_cooler(args.matrix) and args.chromosomes is not None and len(
            args.chromosomes) == 1:
        ma = hm.hiCMatrix(args.matrix, pChrnameList=toString(args.chromosomes))
    else:
        ma = hm.hiCMatrix(args.matrix)

        if args.chromosomes:
            ma.reorderChromosomes(toString(args.chromosomes))

    # mask all zero value bins
    row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
    ma.maskBins(np.flatnonzero(row_sum == 0))
    matrix_shape = ma.matrix.shape
    ma.matrix = convertNansToZeros(ma.matrix)
    ma.matrix = convertInfsToZeros(ma.matrix)

    if 'plotName' in args:
        plot_total_contact_dist(ma, args)
        log.info("Saving diagnostic plot {}\n".format(args.plotName))
        return

    log.info("matrix contains {} data points. Sparsity {:.3f}.".format(
        len(ma.matrix.data),
        float(len(ma.matrix.data)) / (ma.matrix.shape[0]**2)))

    if args.skipDiagonal:
        ma.diagflat(value=0)

    outlier_regions = filter_by_zscore(ma,
                                       args.filterThreshold[0],
                                       args.filterThreshold[1],
                                       perchr=args.perchr)
    # compute and print some statistics
    pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
    ma.printchrtoremove(outlier_regions,
                        label="Bins that are MAD outliers ({:.2f}%) "
                        "out of".format(pct_outlier, ma.matrix.shape[0]),
                        restore_masked_bins=False)

    assert matrix_shape == ma.matrix.shape
    # mask filtered regions
    ma.maskBins(outlier_regions)
    total_filtered_out = set(outlier_regions)

    if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1:
        chrom, _, _, coverage = zip(*ma.cut_intervals)

        assert type(coverage[0]) == np.float64

        failed_bins = np.flatnonzero(
            np.array(coverage) < args.sequencedCountCutoff)

        ma.printchrtoremove(failed_bins,
                            label="Bins with low coverage",
                            restore_masked_bins=False)
        ma.maskBins(failed_bins)
        total_filtered_out = total_filtered_out.union(failed_bins)
        """
        ma.matrix, to_remove = fill_gaps(ma, failed_bins)
        log.warning("From {} failed bins, {} could "
                         "not be filled\n".format(len(failed_bins),
                                                  len(to_remove)))
        ma.maskBins(to_remove)
        """

    if args.transCutoff and 0 < args.transCutoff < 100:
        cutoff = float(args.transCutoff) / 100
        # a usual cutoff is 0.05
        ma.truncTrans(high=cutoff)

    pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    correction_factors = []
    if args.perchr:
        corrected_matrix = lil_matrix(ma.matrix.shape)
        # normalize each chromosome independently
        for chrname in list(ma.interval_trees):
            chr_range = ma.getChrBinRange(chrname)
            chr_submatrix = ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]
            _matrix, _corr_factors = iterative_correction(chr_submatrix, args)
            corrected_matrix[chr_range[0]:chr_range[1],
                             chr_range[0]:chr_range[1]] = _matrix
            correction_factors.append(_corr_factors)
        correction_factors = np.concatenate(correction_factors)

    else:
        corrected_matrix, correction_factors = iterative_correction(
            ma.matrix, args)

    ma.setMatrixValues(corrected_matrix)
    ma.setCorrectionFactors(correction_factors)
    log.info("Correction factors {}".format(correction_factors[:10]))
    if args.inflationCutoff and args.inflationCutoff > 0:
        after_row_sum = np.asarray(corrected_matrix.sum(axis=1)).flatten()
        # identify rows that were expanded more than args.inflationCutoff times
        to_remove = np.flatnonzero(
            after_row_sum / pre_row_sum >= args.inflationCutoff)
        ma.printchrtoremove(to_remove,
                            label="inflated >={} "
                            "regions".format(args.inflationCutoff),
                            restore_masked_bins=False)
        total_filtered_out = total_filtered_out.union(to_remove)

        ma.maskBins(to_remove)

    ma.printchrtoremove(sorted(list(total_filtered_out)),
                        label="Total regions to be removed",
                        restore_masked_bins=False)

    ma.save(args.outFileName, pApplyCorrection=False)
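After outlier, coverage and trans filtering, the matrix is balanced with iterative_correction (per chromosome when --perchr is set). A minimal dense sketch of the underlying idea, iterative correction / matrix balancing, which repeatedly rescales rows and columns by their coverage until the row sums even out (a simplification, not HiCExplorer's implementation):

import numpy as np


def ice_balance(matrix, iterations=50):
    # toy iterative correction: divide by the outer product of normalised row sums
    matrix = np.asarray(matrix, dtype=float).copy()
    total_bias = np.ones(matrix.shape[0])
    for _ in range(iterations):
        coverage = matrix.sum(axis=1)
        bias = np.where(coverage > 0,
                        coverage / coverage[coverage > 0].mean(), 1.0)
        matrix /= np.outer(bias, bias)        # symmetric row/column rescaling
        total_bias *= bias
    return matrix, total_bias


balanced, factors = ice_balance(np.array([[10.0, 2.0], [2.0, 1.0]]))
print(balanced.sum(axis=1))                   # row sums converge to a common value
print(factors)                                # accumulated correction factors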
Example #16
def main(args=None):
    args = parse_arguments().parse_args(args)
    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        log.error("Number of output file names and number of eigenvectors"
                  " does not match. Please"
                  "provide the name of each file.\nFiles: {}\nNumber of "
                  "eigenvectors: {}".format(args.outputFileName,
                                            args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []
    # PCA is computed per chromosome
    length_chromosome = 0
    chromosome_count = len(ma.getChrNames())
    if args.pearsonMatrix:
        trasf_matrix_pearson = lil_matrix(ma.matrix.shape)

    if args.obsexpMatrix:
        trasf_matrix_obsexp = lil_matrix(ma.matrix.shape)

    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]
    if args.extraTrack and (args.extraTrack.endswith('.bw')
                            or args.extraTrack.endswith('.bigwig')):
        bwTrack = pyBigWig.open(args.extraTrack, 'r')
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)

        submatrix = ma.matrix[chr_range[0]:chr_range[1],
                              chr_range[0]:chr_range[1]]
        if args.norm:
            obs_exp_matrix_ = obs_exp_matrix_norm(submatrix)

        else:
            obs_exp_matrix_ = obs_exp_matrix_lieberman(submatrix,
                                                       length_chromosome,
                                                       chromosome_count)
        obs_exp_matrix_ = convertNansToZeros(
            csr_matrix(obs_exp_matrix_)).todense()
        obs_exp_matrix_ = convertInfsToZeros(
            csr_matrix(obs_exp_matrix_)).todense()

        if args.obsexpMatrix:
            trasf_matrix_obsexp[chr_range[0]:chr_range[1],
                                chr_range[0]:chr_range[1]] = lil_matrix(
                                    obs_exp_matrix_)

        pearson_correlation_matrix = np.corrcoef(obs_exp_matrix_)
        pearson_correlation_matrix = convertNansToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()
        pearson_correlation_matrix = convertInfsToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()

        if args.pearsonMatrix:
            trasf_matrix_pearson[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = lil_matrix(
                                     pearson_correlation_matrix)

        corrmatrix = np.cov(pearson_correlation_matrix)
        corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
        corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
        evals, eigs = linalg.eig(corrmatrix)
        k = args.numberOfEigenvectors

        chrom, start, end, _ = zip(
            *ma.cut_intervals[chr_range[0]:chr_range[1]])

        chrom_list += chrom
        start_list += start
        end_list += end
        if args.extraTrack and (args.extraTrack.endswith('.bw')
                                or args.extraTrack.endswith('.bigwig')):
            assert (len(end) == len(start))
            correlateEigenvectorWithHistonMarkTrack(eigs[:, :k].transpose(),
                                                    bwTrack, chrname, start,
                                                    end, args.extraTrack,
                                                    args.histonMarkType)

        vecs_list += eigs[:, :k].tolist()

    if args.pearsonMatrix:
        file_type = 'cool'
        if args.pearsonMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_pearson.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.pearsonMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.obsexpMatrix:
        file_type = 'cool'
        if args.obsexpMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_obsexp.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.obsexpMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.extraTrack and not args.extraTrack.endswith(
            '.bw') and not args.extraTrack.endswith('.bigwig'):
        vecs_list = correlateEigenvectorWithGeneTrack(ma, vecs_list,
                                                      args.extraTrack)

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert (len(vecs_list) == len(chrom_list))

            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    if len(value) == args.numberOfEigenvectors:
                        if isinstance(value[idx], complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(
                            toString(chrom_list[i]), start_list[i],
                            end_list[i], value[idx]))

    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error("ERROR: Your version of pyBigWig is not supporting "
                      "numpy: {}".format(pyBigWig.__file__))
            exit(1)
        old_chrom = chrom_list[0]
        header = []
        for i, _chrom in enumerate(chrom_list):
            if old_chrom != _chrom:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = _chrom

        header.append((toString(chrom_list[-1]), end_list[-1]))
        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))

            assert (len(vecs_list) == len(chrom_list))
            _chrom_list = []
            _start_list = []
            _end_list = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            # set big wig header
            bw.addHeader(header)
            # create entry lists
            for i, value in enumerate(vecs_list):
                # it can happen that some 'value' is having less dimensions than it should
                if len(value) == args.numberOfEigenvectors:
                    if isinstance(value[idx], complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    _chrom_list.append(toString(chrom_list[i]))
                    _start_list.append(start_list[i])
                    _end_list.append(end_list[i])

            # write entries
            bw.addEntries(_chrom_list,
                          _start_list,
                          ends=_end_list,
                          values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)
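The per-chromosome pipeline above is: intra-chromosomal submatrix -> observed/expected -> Pearson correlation matrix -> covariance -> eigenvectors, with the leading eigenvectors written out as the compartment track. A minimal dense sketch of the last steps on a toy obs/exp block (real data goes through hm.hiCMatrix and the obs_exp_matrix_* helpers first):

import numpy as np

# toy observed/expected matrix for one chromosome (4 bins)
obs_exp = np.array([[1.0, 1.2, 0.8, 0.7],
                    [1.2, 1.0, 0.9, 0.8],
                    [0.8, 0.9, 1.0, 1.3],
                    [0.7, 0.8, 1.3, 1.0]])

pearson = np.corrcoef(obs_exp)            # bin-by-bin correlation of contact profiles
covariance = np.cov(pearson)              # as in the code above
evals, eigs = np.linalg.eig(covariance)

order = np.argsort(evals)[::-1]           # sort eigenvectors by eigenvalue
first_eigenvector = np.real(eigs[:, order[0]])
print(first_eigenvector)                  # its sign pattern separates A/B compartments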
Example #17
def main(args=None):

    args = parse_arguments().parse_args(args)

    if args.labels and len(args.matrices) != len(args.labels):
        log.error("The number of labels does not match the number of matrices.")
        exit(1)
    if not args.labels:
        args.labels = [os.path.basename(x) for x in args.matrices]

    num_files = len(args.matrices)
    # initialize results matrix
    results = np.zeros((num_files, num_files), dtype='float')

    rows, cols = np.triu_indices(num_files)
    correlation_opts = {'spearman': spearmanr,
                        'pearson': pearsonr}
    hic_mat_list = []
    max_value = None
    min_value = None
    all_mat = None
    all_nan = []

    for i, matrix in enumerate(args.matrices):
        log.info("loading hic matrix {}\n".format(matrix))

        if (check_cooler(args.matrices[i])) and args.chromosomes is not None and len(args.chromosomes) == 1:
            _mat = hm.hiCMatrix(matrix, pChrnameList=args.chromosomes)
        else:
            _mat = hm.hiCMatrix(matrix)
            if args.chromosomes:
                _mat.keepOnlyTheseChr(args.chromosomes)
            _mat.filterOutInterChrCounts()

        _mat.diagflat(0)
        log.info("restore masked bins {}\n".format(matrix))
        bin_size = _mat.getBinSize()
        all_nan = np.unique(np.concatenate([all_nan, _mat.nan_bins]))

        _mat = triu(_mat.matrix, k=0, format='csr')
        if args.range:
            min_dist, max_dist = args.range.split(":")
            min_dist = int(min_dist)
            max_dist = int(max_dist)
            if max_dist < bin_size:
                log.error("Please specify a max range that is larger than bin size ({})".format(bin_size))
                exit()
            max_depth_in_bins = int(max_dist / bin_size)
            max_dist = int(max_dist) // bin_size
            min_dist = int(min_dist) // bin_size
            # work only with the upper matrix
            # and remove all pixels that are beyond max_depth_in_bins
            # (this is done by subtracting a second sparse matrix
            # that contains only the part of the upper matrix to be removed)
            _mat = triu(_mat, k=0, format='csr') - triu(_mat, k=max_depth_in_bins, format='csr')

            _mat.eliminate_zeros()

            _mat_coo = _mat.tocoo()
            dist = _mat_coo.col - _mat_coo.row
            keep = np.flatnonzero((dist <= max_dist) & (dist >= min_dist))
            _mat_coo.data = _mat_coo.data[keep]
            _mat_coo.row = _mat_coo.row[keep]
            _mat_coo.col = _mat_coo.col[keep]
            _mat = _mat_coo.tocsr()
        else:
            _mat = triu(_mat, k=0, format='csr')

        if args.log1p:
            _mat.data = np.log1p(_mat.data)
        if all_mat is None:
            all_mat = _mat
        else:
            all_mat = all_mat + _mat

        if max_value is None or max_value < _mat.data.max():
            max_value = _mat.data.max()
        if min_value is None or min_value > _mat.data.min():
            min_value = _mat.data.min()

        hic_mat_list.append(_mat)

    # remove nan bins
    rows_keep = cols_keep = np.delete(list(range(all_mat.shape[1])),
                                      all_nan.astype(int))
    all_mat = all_mat[rows_keep, :][:, cols_keep]

    # make large matrix to correlate by
    # using sparse matrix tricks

    big_mat = None
    for mat in hic_mat_list:
        mat = mat[rows_keep, :][:, cols_keep]
        sample_vector = (mat + all_mat).data - all_mat.data
        if big_mat is None:
            big_mat = sample_vector
        else:
            big_mat = np.vstack([big_mat, sample_vector])

    # take the transpose such that columns represent each of the samples
    big_mat = np.ma.masked_invalid(big_mat).T

    grids = gridspec.GridSpec(num_files, num_files)
    grids.update(wspace=0, hspace=0)
    fig = plt.figure(figsize=(2 * num_files, 2 * num_files))
    plt.rcParams['font.size'] = 8.0

    min_value = int(big_mat.min())
    max_value = int(big_mat.max())
    if (min_value % 2 == 0 and max_value % 2 == 0) or \
            (min_value % 2 == 1 and max_value % 2 == 1):
        # make one value odd and the other even
        max_value += 1

    if args.log1p:
        major_locator = FixedLocator(list(range(min_value, max_value, 2)))
        minor_locator = FixedLocator(list(range(min_value, max_value, 1)))

    for index in range(len(rows)):
        row = rows[index]
        col = cols[index]
        if row == col:
            results[row, col] = 1

            # add titles as
            # empty plot in the diagonal
            ax = fig.add_subplot(grids[row, col])
            ax.text(0.6, 0.6, args.labels[row],
                    verticalalignment='center',
                    horizontalalignment='center',
                    fontsize=10, fontweight='bold',
                    transform=ax.transAxes)
            ax.set_axis_off()
            continue

        log.info("comparing {} and {}\n".format(args.matrices[row],
                                                args.matrices[col]))

        # remove cases in which both are zero or one is zero and
        # the other is one
        _mat = big_mat[:, [row, col]]
        _mat = _mat[_mat.sum(axis=1) > 1, :]
        vector1 = _mat[:, 0]
        vector2 = _mat[:, 1]

        results[row, col] = correlation_opts[args.method](vector1, vector2)[0]

        # scatter plots
        ax = fig.add_subplot(grids[row, col])
        if args.log1p:
            ax.xaxis.set_major_locator(major_locator)
            ax.xaxis.set_minor_locator(minor_locator)
            ax.yaxis.set_major_locator(major_locator)
            ax.yaxis.set_minor_locator(minor_locator)

        ax.text(0.2, 0.8, "{}={:.2f}".format(args.method,
                                             results[row, col]),
                horizontalalignment='left',
                transform=ax.transAxes)
        ax.get_yaxis().set_tick_params(
            which='both',
            left='off',
            right='off',
            direction='out')

        ax.get_xaxis().set_tick_params(
            which='both',
            top='off',
            bottom='off',
            direction='out')

        if col != num_files - 1:
            ax.set_yticklabels([])
        else:
            ax.yaxis.tick_right()
            ax.get_yaxis().set_tick_params(
                which='both',
                left='off',
                right='on',
                direction='out')
        if col - row == 1:
            ax.xaxis.tick_bottom()
            ax.get_xaxis().set_tick_params(
                which='both',
                top='off',
                bottom='on',
                direction='out')
        else:
            ax.set_xticklabels([])

        ax.hist2d(vector1, vector2, bins=150, cmin=0.1)
    fig.tight_layout()
    log.info("saving {}".format(args.outFileNameScatter))
    fig.savefig(args.outFileNameScatter, bbox_inches='tight')

    results = results + np.triu(results, 1).T
    plot_correlation(results, args.labels,
                     args.outFileNameHeatmap,
                     args.zMax,
                     args.zMin,
                     args.colorMap,
                     image_format=args.plotFileFormat)
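The pairwise comparison above turns each matrix into a vector over a shared set of positions with the (mat + all_mat).data - all_mat.data trick: adding the union matrix forces every sample onto the same sparsity pattern, and subtracting its data afterwards recovers the sample's own values, aligned position by position. A small standalone sketch of that alignment plus the pearsonr call (toy matrices):

import numpy as np
from scipy.sparse import csr_matrix
from scipy.stats import pearsonr

mat1 = csr_matrix(np.array([[0.0, 2.0], [0.0, 5.0]]))
mat2 = csr_matrix(np.array([[1.0, 0.0], [0.0, 4.0]]))
all_mat = mat1 + mat2                        # union of the non-zero patterns

# both vectors now refer to the same positions, so they can be correlated directly
vector1 = (mat1 + all_mat).data - all_mat.data
vector2 = (mat2 + all_mat).data - all_mat.data
print(pearsonr(vector1, vector2)[0])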
Example #18
    def __init__(self, *args, **kwargs):
        super(EngineHiCTrack, self).__init__(*args, **kwargs)

        log.debug('FILE {}'.format(self.properties))
        # log.debug('pRegion {}'.format(pRegion))
        region = None
        if self.properties['region'] is not None:
            if self.properties['region'][2] == 1e15:
                region = [str(self.properties['region'][0])]
            elif len(self.properties['region']) == 3:
                start = int(self.properties['region'][1]) - int(
                    self.properties['depth'])
                if start < 0:
                    start = 0
                end = int(self.properties['region'][2]) + int(
                    self.properties['depth'])

                region = [
                    str(self.properties['region'][0]) + ':' + str(start) +
                    '-' + str(end)
                ]

        # initialize matrix as HiCMatrix object with no data
        self.hic_ma = HiCMatrix.hiCMatrix(pMatrixFile=None,
                                          pChrnameList=region)
        # create matrix to fill out data and intervals
        if 'matrix shape' not in self.properties:
            self.properties['matrix shape'] = 1000
        if 'binsize' not in self.properties:
            self.properties['binsize'] = 3000
        if 'intervals start' not in self.properties:
            self.properties['intervals start'] = 0

        self.hic_ma.matrix, self.hic_ma.cut_intervals = \
            self.definematrix(self.properties['matrix shape'], self.properties['binsize'], self.properties['intervals start'], self.properties['chrom'])

        self.hic_ma.interval_trees, self.hic_ma.chrBinBoundaries = \
            self.hic_ma.intervalListToIntervalTree(self.hic_ma.cut_intervals)

        if len(self.hic_ma.matrix.data) == 0:
            self.log.error("Matrix {} is empty".format(
                self.properties['file']))
            exit(1)
        if 'show_masked_bins' in self.properties and self.properties[
                'show_masked_bins'] == 'yes':
            pass
        else:
            self.hic_ma.maskBins(self.hic_ma.nan_bins)

        # check that the matrix can be log transformed
        if 'transform' in self.properties:
            if self.properties['transform'] == 'log1p':
                if self.hic_ma.matrix.data.min() + 1 < 0:
                    self.log.error(
                        "\n*ERROR*\nMatrix contains negative values.\n"
                        "log1p transformation can not be applied to \n"
                        "values in matrix: {}".format(self.properties['file']))
                    exit(1)

            elif self.properties['transform'] == '-log':
                if self.hic_ma.matrix.data.min() < 0:
                    self.log.error(
                        "\n*ERROR*\nMatrix contains negative values.\n"
                        "log(-1 * <values>) transformation can not be applied to \n"
                        "values in matrix: {}".format(self.properties['file']))
                    exit(1)

            elif self.properties['transform'] == 'log':
                if self.hic_ma.matrix.data.min() < 0:
                    self.log.error(
                        "\n*ERROR*\nMatrix contains negative values.\n"
                        "log transformation can not be applied to \n"
                        "values in matrix: {}".format(self.properties['file']))
                    exit(1)

        binsize = self.hic_ma.getBinSize()
        max_depth_in_bins = int(self.properties['depth'] / binsize)

        # work only with the upper matrix
        # and remove all pixels that are beyond
        # 2 * max_depth_in_bins, which are not required
        # (this is done by subtracting a second sparse matrix
        # that contains only the part of the upper matrix to be removed)
        limit = 2 * max_depth_in_bins
        self.hic_ma.matrix = scipy.sparse.triu(self.hic_ma.matrix, k=0, format='csr') - \
            scipy.sparse.triu(self.hic_ma.matrix, k=limit, format='csr')
        self.hic_ma.matrix.eliminate_zeros()

        # fill the main diagonal, otherwise it looks
        # not so good. The main diagonal is filled
        # with an array containing the max value found
        # in the matrix
        if sum(self.hic_ma.matrix.diagonal()) == 0:
            self.log.info(
                "Filling main diagonal with max value because it empty and looks bad...\n"
            )
            max_value = self.hic_ma.matrix.data.max()
            main_diagonal = scipy.sparse.dia_matrix(
                ([max_value] * self.hic_ma.matrix.shape[0], [0]),
                shape=self.hic_ma.matrix.shape)
            self.hic_ma.matrix = self.hic_ma.matrix + main_diagonal

        self.plot_inverted = False
        if 'orientation' in self.properties and self.properties[
                'orientation'] == 'inverted':
            self.plot_inverted = True

        self.norm = None

        if 'colormap' not in self.properties:
            self.properties['colormap'] = DEFAULT_MATRIX_COLORMAP
        self.cmap = cm.get_cmap(self.properties['colormap'])
        self.cmap.set_bad('white')
        #self.cmap.set_over('blue')
        self.background = True
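The depth limit above uses a sparse trick: subtracting triu(matrix, k=limit) from triu(matrix, k=0) leaves only the pixels fewer than limit bins above the main diagonal, which is all the track needs to draw. A standalone sketch of that banding step (scipy.sparse only):

import numpy as np
import scipy.sparse

matrix = scipy.sparse.csr_matrix(np.arange(1, 26, dtype=float).reshape(5, 5))
limit = 2    # keep only pixels at most 2 bins above the main diagonal

banded = scipy.sparse.triu(matrix, k=0, format='csr') - \
    scipy.sparse.triu(matrix, k=limit, format='csr')
banded.eliminate_zeros()

print(banded.toarray())    # upper triangle, zeroed beyond the 2-bin band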
Example #19
def main(args=None):
    args = parse_arguments().parse_args(args)

    viewpointObj = Viewpoint()
    referencePoints, _ = viewpointObj.readReferencePointFile(
        args.referencePoints)

    # compute for each viewpoint the sparsity and consider these as bad with a sparsity less than given.

    referencePointsPerThread = len(referencePoints) // args.threads
    queue = [None] * args.threads
    process = [None] * args.threads
    sparsity = []
    fail_flag = False
    fail_message = ''
    for j, matrix in enumerate(args.matrices):
        sparsity_local = [None] * args.threads
        hic_ma = hm.hiCMatrix(matrix)
        viewpointObj.hicMatrix = hic_ma

        all_data_collected = False
        thread_done = [False] * args.threads
        for i in range(args.threads):

            if i < args.threads - 1:
                referencePointsThread = referencePoints[
                    i * referencePointsPerThread:(i + 1) *
                    referencePointsPerThread]
            else:
                referencePointsThread = referencePoints[
                    i * referencePointsPerThread:]
            if len(referencePointsThread) == 0:
                process[i] = None
                queue[i] = None
                sparsity_local[i] = []
                continue
            else:
                queue[i] = Queue()
                process[i] = Process(
                    target=compute_sparsity,
                    kwargs=dict(pReferencePoints=referencePointsThread,
                                pViewpointObj=viewpointObj,
                                pArgs=args,
                                pQueue=queue[i]))

                process[i].start()
                log.debug('process started {}'.format(i))

        while not all_data_collected:
            for i in range(args.threads):
                if queue[i] is not None and not queue[i].empty():
                    sparsity_ = queue[i].get()
                    if 'Fail:' in sparsity_:
                        fail_flag = True
                        fail_message = sparsity_[6:]
                    log.debug('process computed: {}'.format(i))
                    sparsity_local[i] = sparsity_
                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True
            all_data_collected = True
            for thread in thread_done:
                if not thread:
                    all_data_collected = False
            time.sleep(1)

        del hic_ma
        del viewpointObj.hicMatrix

        # merge sparsity data per matrix from each thread to one list
        if fail_flag:
            log.error(fail_message)
            exit(1)
        sparsity_local = [
            item for sublist in sparsity_local for item in sublist
        ]
        sparsity.append(sparsity_local)

    # sparsity = np.array(sparsity)
    # mask = sparsity == -1.0

    # transpose so that the sparsity values are organized per viewpoint: viewpoint = [matrix1, ..., matrix_n]
    sparsity = np.array(sparsity).T
    count_accepted = 0
    count_rejected = 0
    count_failure = 0
    with open(args.referencePoints, 'r') as reference_file_input:
        with open(args.outFileName + '_raw_filter', 'w') as output_file_raw:
            output_file_raw.write(
                '# Created with chicQualityControl version {}\n'.format(
                    __version__))
            output_file_raw.write(
                '# A sparsity of -1.0 indicates a faulty reference point, e.g. no data for this reference point was present in the matrix.\n'
            )
            output_file_raw.write('# Used Matrices ')
            for matrix in args.matrices:
                output_file_raw.write('{}\t'.format(matrix))
            output_file_raw.write('\n# Chromosome\tStart\tEnd')
            for matrix in args.matrices:
                output_file_raw.write('\tSparsity {}'.format(
                    os.path.basename(matrix)))
            output_file_raw.write('\n')

            with open(args.outFileName + '_failed_reference_points',
                      'w') as output_file_failed:
                with open(args.outFileName + '_rejected_filter',
                          'w') as output_file_rejected:
                    with open(args.outFileName, 'w') as output_file:
                        for i, line in enumerate(
                                reference_file_input.readlines()):
                            sparsity_str = '\t'.join(
                                str(x) for x in sparsity[i])
                            output_file_raw.write(line.strip() + '\t' +
                                                  sparsity_str + '\n')
                            count = 0
                            count_negative = 0
                            for j in range(len(sparsity[i])):
                                if sparsity[i][j] == -1.0:
                                    count_negative += 1
                                elif sparsity[i][j] > args.sparsity:
                                    count += 1
                            if count_negative:
                                output_file_failed.write(line)
                                count_failure += 1
                            elif count:
                                output_file.write(line)
                                count_accepted += 1
                            else:
                                output_file_rejected.write(line)
                                count_rejected += 1

    with open(args.outFileName + '_report', 'w') as output_file_report:
        output_file_report.write(
            '# Created with chicQualityControl version {}\n'.format(
                __version__))
        output_file_report.write('# QC report for matrices: ')
        for matrix in args.matrices:
            output_file_report.write(matrix + ' ')
        output_file_report.write('\n')
        output_file_report.write(
            '# Sparsity threshold for rejection: reference points with a sparsity <= {} in all matrices are rejected.\n'
            .format(args.sparsity))
        output_file_report.write('\nNumber of reference points: {}\n'.format(
            str(count_accepted + count_rejected + count_failure)))
        output_file_report.write(
            'Number of accepted reference points: {}\n'.format(
                str(count_accepted)))
        output_file_report.write(
            'Number of rejected reference points: {}\n'.format(
                str(count_rejected)))
        output_file_report.write(
            'Number of faulty reference points: {}\n'.format(
                str(count_failure)))
        output_file_report.write(
            '\n\nA faulty reference point is caused by the absence of the chromosome in one of the given matrices.\n'
        )
        output_file_report.write(
            'It can also be caused by the absence of valid Hi-C reads in a region, especially at the chromosome ends.\n'
        )
        output_file_report.write(
            'Please check the results of hicInfo to validate this for your data.\n'
        )

    # output plot of sparsity distribution per sample
    # remove faulty reference points from the statistics
    x = [[]] * len(args.matrices)
    y = [[]] * len(args.matrices)

    mask = [True] * len(sparsity)
    for i in range(len(sparsity)):
        delete_instance = False
        for j in range(len(args.matrices)):
            if sparsity[i][j] == -1.0:
                delete_instance = True
        if delete_instance:
            mask[i] = False

    mask = np.array(mask)
    sparsity = sparsity[mask]

    for i in range(len(args.matrices)):
        y[i] = [i] * len(sparsity)

    sparsity = sparsity.T

    for i in range(len(args.matrices)):
        x[i] = sparsity[i].flatten()

    for i in range(len(args.matrices)):
        plt.plot(x[i],
                 y[i],
                 'o',
                 mfc='none',
                 markersize=0.3,
                 label=args.matrices[i].split('/')[-1])
    plt.yticks([])
    plt.xlabel("Sparsity level")

    plt.axvline(x=args.sparsity,
                c='r',
                label='sparsity threshold',
                linewidth=0.3)
    plt.xscale('log')
    ax = plt.gca()
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.6, box.height])
    plt.legend(loc='center', bbox_to_anchor=(1.4, 0.5))
    plt.savefig(args.outFileNameSparsity, dpi=args.dpi)

    # plt.xlabel("Length of list (number)")
    # plt.ylabel("Time taken (seconds)")
    plt.close()
    for i in range(len(args.matrices)):
        plt.hist(x[i],
                 bins=100,
                 alpha=0.5,
                 label=args.matrices[i].split('/')[-1])
    plt.xlabel("Sparsity level")
    plt.ylabel("Number of counts")
    # plt.legend(loc='upper right')

    ax = plt.gca()
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.6, box.height])
    plt.legend(loc='center', bbox_to_anchor=(1.4, 0.5))
    plt.savefig(args.outFileNameHistogram, dpi=args.dpi)
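
# Illustrative sketch (not part of HiCExplorer): the process/queue fan-out pattern
# used by the main() above, reduced to a toy worker. The items are split into one
# chunk per process, each worker puts its partial result on a Queue, and the
# parent polls the queues until every worker has delivered. toy_worker and
# fan_out_collect are made-up names for this sketch.
import time
from multiprocessing import Process, Queue


def toy_worker(pItems, pQueue):
    # stand-in for compute_sparsity: one value per input item
    pQueue.put([len(str(item)) for item in pItems])


def fan_out_collect(items, threads=4):
    per_thread = len(items) // threads
    queue = [None] * threads
    process = [None] * threads
    done = [False] * threads
    results = [[] for _ in range(threads)]

    for i in range(threads):
        if i < threads - 1:
            chunk = items[i * per_thread:(i + 1) * per_thread]
        else:
            chunk = items[i * per_thread:]
        if len(chunk) == 0:
            done[i] = True
            continue
        queue[i] = Queue()
        process[i] = Process(target=toy_worker,
                             kwargs=dict(pItems=chunk, pQueue=queue[i]))
        process[i].start()

    while not all(done):
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                results[i] = queue[i].get()
                process[i].join()
                queue[i] = None
                done[i] = True
        time.sleep(0.1)

    # merge the per-process partial results into one flat list
    return [value for sublist in results for value in sublist]


if __name__ == '__main__':
    print(fan_out_collect(list(range(10)), threads=3))
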
def main(args=None):
    args = parse_arguments().parse_args(args)

    viewpointObj = Viewpoint()
    referencePoints, _ = viewpointObj.readReferencePointFile(
        args.referencePoints)

    # compute the sparsity for each viewpoint; viewpoints with a sparsity below the given threshold are considered bad.

    referencePointsPerThread = len(referencePoints) // args.threads
    queue = [None] * args.threads
    process = [None] * args.threads
    sparsity = []

    for j, matrix in enumerate(args.matrices):
        sparsity_local = [None] * args.threads
        hic_ma = hm.hiCMatrix(matrix)
        viewpointObj.hicMatrix = hic_ma

        all_data_collected = False
        thread_done = [False] * args.threads
        for i in range(args.threads):

            if i < args.threads - 1:
                referencePointsThread = referencePoints[
                    i * referencePointsPerThread:(i + 1) *
                    referencePointsPerThread]
            else:
                referencePointsThread = referencePoints[
                    i * referencePointsPerThread:]
            if len(referencePointsThread) == 0:
                process[i] = None
                queue[i] = None
                sparsity_local[i] = []
                continue
            else:
                queue[i] = Queue()
                process[i] = Process(
                    target=compute_sparsity,
                    kwargs=dict(pReferencePoints=referencePointsThread,
                                pViewpointObj=viewpointObj,
                                pArgs=args,
                                pQueue=queue[i]))

                process[i].start()

        while not all_data_collected:
            for i in range(args.threads):
                if queue[i] is not None and not queue[i].empty():
                    sparsity_ = queue[i].get()
                    sparsity_local[i] = sparsity_
                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True
            all_data_collected = True
            for thread in thread_done:
                if not thread:
                    all_data_collected = False
            time.sleep(1)

        del hic_ma
        del viewpointObj.hicMatrix

        # merge sparsity data per matrix from each thread to one list

        sparsity_local = [
            item for sublist in sparsity_local for item in sublist
        ]
        sparsity.append(sparsity_local)

    # transpose so that the sparsity values are organized per viewpoint: viewpoint = [matrix1, ..., matrix_n]
    sparsity = np.array(sparsity).T

    with open(args.referencePoints, 'r') as reference_file_input:
        with open(args.outFileName + '_raw_filter', 'w') as output_file_raw:
            output_file_raw.write(
                '# Created with chicQualityControl version {}\n'.format(
                    __version__))
            output_file_raw.write('# Chromosome\tStart\tEnd\t')
            for matrix in args.matrices:
                output_file_raw.write('Sparsity {}\t'.format(matrix))
            output_file_raw.write('\n')

            with open(args.outFileName + '_rejected_filter',
                      'w') as output_file_rejected:
                with open(args.outFileName, 'w') as output_file:
                    for i, line in enumerate(reference_file_input.readlines()):
                        sparsity_str = '\t'.join(str(x) for x in sparsity[i])
                        output_file_raw.write(line.strip() + '\t' +
                                              sparsity_str + '\n')
                        count = 0
                        for j in range(len(sparsity[i])):
                            if sparsity[i][j] > args.sparsity:
                                count += 1
                        if count:
                            output_file.write(line)
                        else:
                            output_file_rejected.write(line)
    # output plot of sparsity distribution per sample

    # re-arrange the values for plotting

    x = [[]] * len(args.matrices)
    y = [[]] * len(args.matrices)

    for i in range(len(args.matrices)):
        y[i] = [i] * len(sparsity)
    sparsity = sparsity.T

    for i in range(len(args.matrices)):
        x[i] = sparsity[i].flatten()

    for i in range(len(args.matrices)):
        plt.plot(x[i],
                 y[i],
                 'o',
                 mfc='none',
                 markersize=0.3,
                 label=args.matrices[i].split('/')[-1])
    plt.yticks([])
    plt.xlabel("Sparsity level")

    plt.axvline(x=args.sparsity,
                c='r',
                label='sparsity threshold',
                linewidth=0.3)
    plt.xscale('log')
    ax = plt.gca()
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.6, box.height])
    plt.legend(loc='center', bbox_to_anchor=(1.4, 0.5))
    plt.savefig(args.outFileNameSparsity, dpi=args.dpi)

    # plt.xlabel("Length of list (number)")
    # plt.ylabel("Time taken (seconds)")
    plt.close()
    for i in range(len(args.matrices)):
        plt.hist(x[i],
                 bins=100,
                 alpha=0.5,
                 label=args.matrices[i].split('/')[-1])
    plt.xlabel("Sparsity level")
    plt.ylabel("Number of counts")
    # plt.legend(loc='upper right')

    ax = plt.gca()
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.6, box.height])
    plt.legend(loc='center', bbox_to_anchor=(1.4, 0.5))
    plt.savefig(args.outFileNameHistogram, dpi=args.dpi)
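
# Illustrative sketch (not part of HiCExplorer): how the per-matrix sparsity lists
# are turned into per-viewpoint rows, and how viewpoints that are faulty in any
# matrix (marked with -1.0) can be dropped before plotting, as in the first
# version of main() above. The numbers are made up.
import numpy as np

sparsity = np.array([[0.10, 0.80, -1.0, 0.40],    # matrix 1, four viewpoints
                     [0.12, 0.75, 0.30, 0.35]])   # matrix 2, four viewpoints

per_viewpoint = sparsity.T                        # shape: (viewpoints, matrices)
keep = ~np.any(per_viewpoint == -1.0, axis=1)     # drop viewpoints with any -1.0
filtered = per_viewpoint[keep]
print(filtered)      # viewpoint 3 removed
print(filtered.T)    # back to one row per matrix, ready for plotting
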
Example #21
def main(args=None):
    args = parse_arguments().parse_args(args)

    viewpointObj = Viewpoint()

    referencePoints, gene_list = viewpointObj.readReferencePointFile(
        args.referencePoints)
    referencePointsPerThread = len(referencePoints) // args.threads
    queue = [None] * args.threads
    process = [None] * args.threads
    file_list = []
    background_model = viewpointObj.readBackgroundDataFile(
        args.backgroundModelFile, args.range, args.fixateRange)
    background_model_mean_values = viewpointObj.readBackgroundDataFile(
        args.backgroundModelFile, args.range, args.fixateRange, pMean=True)
    # background_sum_of_densities_dict = viewpointObj.computeSumOfDensities(
    #     background_model, args, pXfoldMaxValue=args.xFoldMaxValueNB)

    if not os.path.exists(args.outputFolder):
        try:
            os.makedirs(args.outputFolder)
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    fail_flag = False
    fail_message = ''

    for matrix in args.matrices:
        hic_ma = hm.hiCMatrix(matrix)
        viewpointObj.hicMatrix = hic_ma
        file_list_sample = [None] * args.threads
        all_data_collected = False

        for i in range(args.threads):

            if i < args.threads - 1:
                referencePointsThread = referencePoints[i * referencePointsPerThread:(i + 1) * referencePointsPerThread]
                geneListThread = gene_list[i * referencePointsPerThread:(i + 1) * referencePointsPerThread]
            else:
                referencePointsThread = referencePoints[i * referencePointsPerThread:]
                geneListThread = gene_list[i * referencePointsPerThread:]

            if len(referencePointsThread) == 0:
                process[i] = None
                queue[i] = None
                file_list_sample[i] = []
                continue
            queue[i] = Queue()
            process[i] = Process(target=compute_viewpoint, kwargs=dict(
                pViewpointObj=viewpointObj,
                pArgs=args,
                pQueue=queue[i],
                pReferencePoints=referencePointsThread,
                pGeneList=geneListThread,
                pMatrix=matrix,
                pBackgroundModel=background_model,
                pBackgroundModelRelativeInteractions=background_model_mean_values,
                pOutputFolder=args.outputFolder
            )
            )

            process[i].start()

        while not all_data_collected:
            for i in range(args.threads):
                if queue[i] is not None and not queue[i].empty():
                    file_list_ = queue[i].get()
                    if 'Fail:' in file_list_:
                        fail_flag = True
                        fail_message = file_list_[6:]
                    file_list_sample[i] = file_list_
                    process[i].join()
                    process[i].terminate()
                    process[i] = None

            all_data_collected = True

            for i in range(args.threads):
                if process[i] is not None:
                    all_data_collected = False
            time.sleep(1)

        if fail_flag:
            log.error(fail_message)
            exit(1)

        file_list_sample = [item for sublist in file_list_sample for item in sublist]
        file_list.append(file_list_sample)

    log.debug('file_list {}'.format(file_list))
    if args.writeFileNamesToFile:
        with open(args.writeFileNamesToFile, 'w') as file:
            log.debug('len(file_list) {}'.format(len(file_list)))
            if len(file_list) > 1:
                for i, sample in enumerate(file_list):
                    for sample2 in file_list[i + 1:]:
                        for viewpoint, viewpoint2 in zip(sample, sample2):
                            file.write(viewpoint + '\n')
                            file.write(viewpoint2 + '\n')
            else:
                for viewpoint in file_list[0]:
                    file.write(viewpoint + '\n')
    if args.allViewpointsList:

        with open(args.writeFileNamesToFile + 'all', 'w') as file:
            if len(file_list) > 1:
                for i, sample in enumerate(file_list[0]):
                    file.write(sample + '\n')
                    for j in range(1, len(file_list)):
                        file.write(file_list[j][i] + '\n')
            else:
                for viewpoint in file_list[0]:
                    file.write(viewpoint + '\n')
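
# Illustrative sketch (not part of HiCExplorer): the errno-based guard used in the
# main() above protects os.makedirs against the race where another process creates
# the folder between the existence check and the call; on Python >= 3.2 the
# exist_ok flag expresses the same intent. ensure_folder is a made-up name.
import errno
import os


def ensure_folder(path):
    try:
        os.makedirs(path)
    except OSError as exc:  # guard against race condition
        if exc.errno != errno.EEXIST:
            raise


# equivalent on modern Python:
# os.makedirs(path, exist_ok=True)
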
Example #22
def main(args=None):

    args = parse_arguments().parse_args(args)

    if args.chromosomes is not None and args.regions is not None:
        log.error('Please specify either --chromosomes or --regions.')
        exit(1)
    hic_ma = None
    if args.chromosomes:

        if check_cooler(args.matrix) and len(
                args.chromosomes) == 1 and args.action == 'keep':
            hic_ma = hm.hiCMatrix(args.matrix, pChrnameList=args.chromosomes)
        else:
            hic_ma = hm.hiCMatrix(args.matrix)

        if args.action == 'keep':
            hic_ma.reorderChromosomes(args.chromosomes)
        elif args.action == 'remove':
            chromosomes = list(hic_ma.chrBinBoundaries)
            for chromosome in args.chromosomes:
                if chromosome in chromosomes:
                    chromosomes.remove(chromosome)
            hic_ma.reorderChromosomes(chromosomes)
        elif args.action == 'mask':
            hic_ma.maskChromosomes(args.chromosomes)
    elif args.regions:
        hic_ma = hm.hiCMatrix(args.matrix)
        genomic_regions = []
        with open(args.regions, 'r') as file:
            for line in file.readlines():
                _line = line.strip().split('\t')
                if len(_line) < 3:
                    continue
                chrom, start, end = _line[0], int(_line[1]), int(_line[2]) - 1
                genomic_regions.append((chrom, start, end))

        # log.debug('genomic_regions {}'.format(genomic_regions))
        matrix_indices_regions = []
        for region in genomic_regions:
            _regionBinRange = hic_ma.getRegionBinRange(region[0], region[1],
                                                       region[2])
            if _regionBinRange is not None:
                start, end = _regionBinRange
                matrix_indices_regions.extend(list(range(start, end)))

        # log.debug('matrix_indices_regions {}'.format(matrix_indices_regions))
        if args.action == 'keep':
            hic_ma.reorderBins(matrix_indices_regions)
        elif args.action == 'mask':
            hic_ma.maskBins(matrix_indices_regions)

        elif args.action == 'remove':

            full_matrix_range = np.array(
                range(0, max(hic_ma.matrix.shape[0], hic_ma.matrix.shape[1])))
            matrix_indices_regions = np.array(matrix_indices_regions)
            full_matrix_range[matrix_indices_regions] = -1
            mask = full_matrix_range != -1
            full_matrix_range = full_matrix_range[mask]

            hic_ma.reorderBins(full_matrix_range)
    elif args.maskBadRegions:
        # args.chromosomes is not set in this branch, therefore load the full matrix
        hic_ma = hm.hiCMatrix(args.matrix)

    else:
        log.info(
            'No data to adjust given. Please specify either the --chromosomes or the --regions parameter.'
        )

    if hic_ma is not None:
        hic_ma.save(args.outFileName)
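
# Illustrative sketch (not part of HiCExplorer): the 'remove' branch above keeps
# every bin index that is not listed in matrix_indices_regions. The -1 marker
# trick from the code and np.setdiff1d give the same result; the indices below
# are made up.
import numpy as np

n_bins = 10
matrix_indices_regions = np.array([2, 3, 7])

full_matrix_range = np.arange(n_bins)
full_matrix_range[matrix_indices_regions] = -1
kept_marker_trick = full_matrix_range[full_matrix_range != -1]

kept_setdiff = np.setdiff1d(np.arange(n_bins), matrix_indices_regions)

assert np.array_equal(kept_marker_trick, kept_setdiff)
print(kept_setdiff)  # [0 1 4 5 6 8 9]
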
Example #23
def main(args=None):

    args = parse_arguments().parse_args(args)
    for matrix in args.matrices:
        # if
        generated_by = None
        genome_assembly = None
        statistics = None
        generated_by_cooler_lib = None
        tool_url = None
        matrix_generated_by = None
        matrix_generated_by_url = None
        creation_date = None
        chromosomes = None
        bin_length = None
        size = None
        nchroms = None
        num_non_zero = None
        min_non_zero = None
        max_non_zero = None
        sum_elements = None
        num_nan_bins = None

        if check_cooler(matrix) and args.no_metadata:
            cooler_file = cooler.Cooler(matrix)

            if cooler_file.info is not None:
                # log.debug('cooler_file.info {}'.format(cooler_file.info))
                if 'bin-size' in cooler_file.info:
                    bin_length = cooler_file.info['bin-size']
                if 'nbins' in cooler_file.info:
                    size = cooler_file.info['nbins']
                if 'nchroms' in cooler_file.info:
                    nchroms = cooler_file.info['nchroms']
                if 'chromosomes' in cooler_file.info:
                    chromosomes = cooler_file.info['chromosomes']
                if 'nnz' in cooler_file.info:
                    num_non_zero = cooler_file.info['nnz']
                if 'min-value' in cooler_file.info:
                    min_non_zero = cooler_file.info['min-value']
                if 'max-value' in cooler_file.info:
                    max_non_zero = cooler_file.info['max-value']
                if 'generated-by' in cooler_file.info:
                    generated_by = toString(cooler_file.info['generated-by'])
                if 'genome-assembly' in cooler_file.info:
                    genome_assembly = toString(
                        cooler_file.info['genome-assembly'])
                if 'metadata' in cooler_file.info:
                    if cooler_file.info['metadata'] is not None:
                        if 'statistics' in cooler_file.info['metadata']:
                            statistics = cooler_file.info['metadata'][
                                'statistics']
                if 'generated-by-cooler-lib' in cooler_file.info:
                    generated_by_cooler_lib = toString(
                        cooler_file.info['generated-by-cooler-lib'])
                if 'tool-url' in cooler_file.info:
                    tool_url = toString(cooler_file.info['tool-url'])
                if 'matrix-generated-by' in cooler_file.info:
                    matrix_generated_by = toString(
                        cooler_file.info['matrix-generated-by'])
                if 'matrix-generated-by-url' in cooler_file.info:
                    matrix_generated_by_url = toString(
                        cooler_file.info['matrix-generated-by-url'])
                if 'creation-date' in cooler_file.info:
                    creation_date = cooler_file.info['creation-date']
                if 'sum-elements' in cooler_file.info:
                    sum_elements = cooler_file.info['sum-elements']

        else:
            hic_ma = hm.hiCMatrix(matrix)
            size = hic_ma.matrix.shape[0]
            num_non_zero = hic_ma.matrix.nnz
            sum_elements = hic_ma.matrix.sum() / 2
            bin_length = hic_ma.getBinSize()
            num_nan_bins = len(hic_ma.nan_bins)
            min_non_zero = hic_ma.matrix.data.min()
            max_non_zero = hic_ma.matrix.data.max()

            chromosomes = list(hic_ma.chrBinBoundaries)

        information = StringIO()
        information.write(
            "# Matrix information file. Created with HiCExplorer's hicInfo version {}\n"
            .format(__version__))

        if matrix is not None:
            information.write("File:\t{}\n".format(matrix))
        if creation_date is not None:
            information.write("Date:\t{}\n".format(creation_date))

        if genome_assembly is not None:
            information.write("Genome assembly:\t{}\n".format(genome_assembly))
        if size is not None:
            information.write("Size:\t{:,}\n".format(size))
        if bin_length is not None:
            information.write("Bin_length:\t{}\n".format(bin_length))
        if sum_elements is not None:
            information.write("Sum of matrix:\t{}\n".format(sum_elements))
        if chromosomes is not None:
            information.write("Chromosomes:\t{}\n".format(", ".join(
                toString(chromosomes))))
        if nchroms is not None:
            information.write("Number of chromosomes:\t{}\n".format(nchroms))
        if num_non_zero is not None:
            information.write(
                "Non-zero elements:\t{:,}\n".format(num_non_zero))
        if min_non_zero is not None:
            information.write("Minimum (non zero):\t{}\n".format(min_non_zero))
        if max_non_zero is not None:
            information.write("Maximum:\t{}\n".format(max_non_zero))
        if num_nan_bins is not None:
            information.write("NaN bins:\t{}\n".format(num_nan_bins))

        if check_cooler(matrix):
            information.write(
                'The following columns are available: {}\n'.format(
                    cooler.Cooler(matrix).bins().columns.values))
        if generated_by is not None:
            information.write("\n\nGenerated by:\t{}\n".format(generated_by))

        if generated_by_cooler_lib is not None:
            information.write("Cooler library version:\t{}\n".format(
                generated_by_cooler_lib))
        if tool_url is not None:
            information.write("HiCMatrix url:\t{}\n".format(tool_url))
        if matrix_generated_by is not None:
            information.write("Interaction matrix created with:\t{}\n".format(
                matrix_generated_by))
        if matrix_generated_by_url is not None:
            information.write("URL:\t{}\n".format(matrix_generated_by_url))

        if statistics is not None:
            information.write("\n\nBuild statistics:\n{}\n".format(statistics))

        if args.outFileName:
            with open(args.outFileName, 'w') as file:
                file.write(information.getvalue())
        else:
            print(information.getvalue())

        information.close()
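
# Illustrative sketch (not part of HiCExplorer): the long chain of
# "if key in cooler_file.info" checks above can be written with dict.get, which
# returns None for missing keys. This assumes cooler_file.info behaves like a
# plain dict, as the membership tests above suggest; 'example.cool' is a
# placeholder path for an existing cooler file.
import cooler

cooler_file = cooler.Cooler('example.cool')
info = cooler_file.info or {}

bin_length = info.get('bin-size')
size = info.get('nbins')
nchroms = info.get('nchroms')
num_non_zero = info.get('nnz')
statistics = (info.get('metadata') or {}).get('statistics')

print(bin_length, size, nchroms, num_non_zero, statistics)
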
def main(args=None):
    """
    for each distance, compare the
    distribution of two samples,
    report number of cases were they differ
    """

    args = parse_arguments().parse_args(args)
    mean_dict = OrderedDict()
    matrix_sum = {}
    if args.labels is None:
        labels = OrderedDict([(x, os.path.basename(x)) for x in args.matrices])
    else:
        labels = OrderedDict(zip(args.matrices, args.labels))

    chroms = set()
    for matrix_file in args.matrices:
        hic_ma = HiCMatrix.hiCMatrix(matrix_file)
        matrix_sum[matrix_file] = hic_ma.matrix.sum()
        if args.chromosomeExclude is None:
            args.chromosomeExclude = []

        chrtokeep = [x for x in list(hic_ma.interval_trees) if x not in args.chromosomeExclude]
        hic_ma.keepOnlyTheseChr(chrtokeep)

        mean_dict[matrix_file] = compute_distance_mean(hic_ma, maxdepth=args.maxdepth, perchr=args.perchr)
        chroms = chroms.union([k for k in list(mean_dict[matrix_file]) if len(mean_dict[matrix_file][k]) > 1])

    # compute scale factors such that values are comparable
    min_sum = min(matrix_sum.values())
    scale_factor = dict([(matrix_file, float(min_sum) / mat_sum) for matrix_file, mat_sum in matrix_sum.items()])
    log.info("The scale factors used are: {}".format(scale_factor))
    if len(args.matrices) > 1 and args.perchr:
        # in this case, for each chromosome a plot is made that combines the data from the
        # hic matrices
        max_cols = 4
        num_rows = int(np.ceil(float(len(chroms)) / max_cols))
        num_cols = min(len(chroms), max_cols)

    else:
        num_cols = num_rows = 1

    if args.plotsize is None:
        width = 6
        height = 4
    else:
        width, height = args.plotsize
    fig = plt.figure(figsize=(width * num_cols, height * num_rows))

    axs = np.empty((num_rows, num_cols), dtype='object')
    export_tables = []
    for matrix_file in args.matrices:
        idx = 0
        for chrom, mean_values in mean_dict[matrix_file].items():
            if len(mean_values) <= 1:
                log.debug("No values found for: {}, chromosome: {}\n".format(matrix_file, chrom))
                continue
            x, y = zip(*[(k, v) for k, v in mean_values.items() if v > 0])
            if len(x) <= 1:
                log.debug("No values found for: {}, chromosome: {}\n".format(matrix_file, chrom))
                continue
            if args.perchr and len(args.matrices) == 1:
                col = 0
                row = 0
            else:
                col = idx % num_cols
                row = idx // num_cols
            if axs[row, col] is None:
                ax = plt.subplot2grid((num_rows, num_cols), (row, col))
                ax.set_xlabel('genomic distance')
                ax.set_ylabel('corrected Hi-C counts')
                try:
                    ax.set_yscale('log')
                    ax.set_xscale('log')
                except ValueError:
                    continue
            else:
                ax = axs[row, col]
            y = np.array(y) * scale_factor[matrix_file]
            if args.perchr and len(args.matrices) > 1:
                label = labels[matrix_file]
                ax.set_title(chrom)
            elif args.perchr:
                label = chrom
            else:
                label = labels[matrix_file]

            ax.plot(x, y, label=label)
            axs[row, col] = ax
            idx += 1
            if args.outFileData is not None:
                table_to_export = pd.DataFrame({'Matrix': labels[matrix_file],
                                                'Chromosome': chrom,
                                                'Distance': np.asarray(x),
                                                'Contacts': np.asarray(y)})
                export_tables.append(table_to_export)

    for ax in axs.reshape(-1):
        if ax is None:
            continue
        ax.legend(prop={'size': 'small'})
        ax.set_xlim(0, args.maxdepth)
        handles, labels = ax.get_legend_handles_labels()
        lgd = ax.legend(handles, labels, loc='center left', bbox_to_anchor=(1, 0.5))

    if args.outFileData is not None and export_tables:
        # write the collected data once instead of overwriting the file per chromosome
        pd.concat(export_tables).to_csv(args.outFileData, sep='\t')

    plt.tight_layout()
    plt.savefig(args.plotFile.name, bbox_inches='tight', bbox_extra_artists=(lgd,))
    plt.close(fig)
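
# Illustrative sketch (not part of HiCExplorer): the scale factors computed in the
# main() above rescale every matrix to the total contact count of the smallest
# one, so the distance-vs-counts curves become comparable. File names and totals
# below are made up.
matrix_sum = {'sample_a.h5': 2.0e6, 'sample_b.h5': 1.0e6, 'sample_c.h5': 4.0e6}

min_sum = min(matrix_sum.values())
scale_factor = dict([(name, float(min_sum) / total)
                     for name, total in matrix_sum.items()])
print(scale_factor)  # {'sample_a.h5': 0.5, 'sample_b.h5': 1.0, 'sample_c.h5': 0.25}
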
Example #25
def plotMatrix(matrixinputfile,imageoutputfile, regionindex1, regionindex2, comparematrix, title, bigwig):
        if not checkExtension(matrixinputfile, '.cool'):
            msg = "input matrix must be in cooler format (.cool)"
            raise SystemExit(msg)
        if comparematrix and not checkExtension(comparematrix, ".cool"):
            msg = "if specified, compare matrix must be in cooler format (.cool)"
            raise SystemExit(msg)
        if not imageoutputfile:
            imageoutputfile = os.path.splitext(matrixinputfile)[0] + '.png'
        elif imageoutputfile and not checkExtension(imageoutputfile, ".png"):
            imageoutputfile = os.path.splitext(imageoutputfile)[0] + ".png"
       
        #get the full matrix first to extract the desired region
        ma = hm.hiCMatrix(matrixinputfile)
        cuts = ma.cut_intervals
        chromosome = cuts[0][0]
        maxIndex = len(cuts) - 1
        #check indices and get the region if ok
        if regionindex1 > maxIndex:
            msg = "invalid start region. Allowed is 0 to {0:d} (0 to {1:d})".format(maxIndex, cuts[maxIndex][1])
            raise SystemExit(msg)
        if regionindex2 < regionindex1:
           msg = "region index 2 must be smaller than region index 1"
           raise SystemExit(msg)
        if regionindex2 > maxIndex:
            regionindex2 = maxIndex
            print("region index 2 clamped to max. value {0:d}".format(maxIndex))
        region = str(chromosome) +":"+str(cuts[regionindex1][1])+"-"+ str(cuts[regionindex2][1])
        
        #now get the data for the input matrix, restricted to the desired region
        upperHiCMatrix = hm.hiCMatrix(matrixinputfile ,pChrnameList=[region])
        upperMatrix = triu(upperHiCMatrix.matrix, k=1, format="csr")
        
        #if set, get data from the same region also for the compare matrix
        #there's no compatibility check so far
        lowerHiCMatrix = None
        lowerMatrix = None
        if comparematrix:
            lowerHiCMatrix = hm.hiCMatrix(comparematrix)
            if chromosome not in [row[0] for row in lowerHiCMatrix.cut_intervals]:
                msg = "compare matrix must contain the same chromosome as the input matrix"
                raise SystemExit(msg)
            lowerHiCMatrix = hm.hiCMatrix(comparematrix , pChrnameList=[region])
            lowerMatrix = tril(lowerHiCMatrix.matrix, k=0, format="csr") 

            if lowerMatrix.get_shape() != upperMatrix.get_shape():
                msg = "shapes of input matrix and compare matrix do not match. Check resolutions"
                raise SystemExit(msg)

        #arguments for plotting
        plotArgs = Namespace(bigwig=bigwig, 
                             chromosomeOrder=None, 
                             clearMaskedBins=False, 
                             colorMap='RdYlBu_r', 
                             disable_tight_layout=False, 
                             dpi=300, 
                             flipBigwigSign=False, 
                             log=False, log1p=True, 
                             perChromosome=False, 
                             region=region, 
                             region2=None, 
                             scaleFactorBigwig=1.0, 
                             scoreName=None, 
                             title=title, 
                             vMax=None, vMaxBigwig=None, 
                             vMin=1.0, vMinBigwig=None,
                             matrix=matrixinputfile)
        
        #following code is largely duplicated from hicPlotMatrix
        #not exactly beautiful, but works for now
        chrom, region_start, region_end, idx1, start_pos1, chrom2, region_start2, region_end2, idx2, start_pos2 = hicPlot.getRegion(plotArgs, upperHiCMatrix)
        

        mixedMatrix = None
        if comparematrix:
            mixedMatrix = np.asarray((lowerMatrix + upperMatrix).todense().astype(float))
        else:
            mixedMatrix = np.asarray(upperHiCMatrix.matrix.todense().astype(float))
        
        #colormap for plotting
        cmap = cm.get_cmap(plotArgs.colorMap) # pylint: disable=no-member
        cmap.set_bad('black')
        
        bigwig_info = None
        if plotArgs.bigwig: # pylint: disable=no-member
            bigwig_info = {'args': plotArgs, 'axis': None, 'axis_colorbar': None, 'nan_bins': upperHiCMatrix.nan_bins}
        norm = None

        if plotArgs.log or plotArgs.log1p: # pylint: disable=no-member
            mask = mixedMatrix == 0
            try:
                mixedMatrix[mask] = np.nanmin(mixedMatrix[mask == False])
            except ValueError:
                log.info('Matrix contains only 0. Set all values to {}'.format(np.finfo(float).tiny))
                mixedMatrix[mask] = np.finfo(float).tiny
            if np.isnan(mixedMatrix).any() or np.isinf(mixedMatrix).any():
                log.debug("any nan {}".format(np.isnan(mixedMatrix).any()))
                log.debug("any inf {}".format(np.isinf(mixedMatrix).any()))
                mask_nan = np.isnan(mixedMatrix)
                mask_inf = np.isinf(mixedMatrix)
                mixedMatrix[mask_nan] = np.nanmin(mixedMatrix[mask_nan == False])
                mixedMatrix[mask_inf] = np.nanmin(mixedMatrix[mask_inf == False])

        log.debug("any nan after remove of nan: {}".format(np.isnan(mixedMatrix).any()))
        log.debug("any inf after remove of inf: {}".format(np.isinf(mixedMatrix).any()))
        if plotArgs.log1p: # pylint: disable=no-member
            mixedMatrix += 1
            norm = LogNorm()
        elif plotArgs.log: # pylint: disable=no-member 
            norm = LogNorm()

        if plotArgs.bigwig: # pylint: disable=no-member
            # increase figure height to accommodate bigwig track
            fig_height = 8.5
        else:
            fig_height = 7
        height = 4.8 / fig_height
        
        fig_width = 8
        width = 5.0 / fig_width
        left_margin = (1.0 - width) * 0.5

        fig = plt.figure(figsize=(fig_width, fig_height), dpi=plotArgs.dpi) # pylint: disable=no-member

        if plotArgs.bigwig: # pylint: disable=no-member
            gs = gridspec.GridSpec(2, 2, height_ratios=[0.90, 0.1], width_ratios=[0.97, 0.03])
            gs.update(hspace=0.05, wspace=0.05)
            ax1 = plt.subplot(gs[0, 0])
            ax2 = plt.subplot(gs[1, 0])
            ax3 = plt.subplot(gs[0, 1])
            bigwig_info['axis'] = ax2
            bigwig_info['axis_colorbar'] = ax3
        else:
            ax1 = None
        
        bottom = 1.3 / fig_height

        position = [left_margin, bottom, width, height]
        hicPlot.plotHeatmap(mixedMatrix, ma.get_chromosome_sizes(), fig, position,
                    plotArgs, cmap, xlabel=chrom, ylabel=chrom2,
                    start_pos=start_pos1, start_pos2=start_pos2, pNorm=norm, pAxis=ax1, pBigwig=bigwig_info)
        plt.savefig(imageoutputfile, dpi=plotArgs.dpi) # pylint: disable=no-member
        plt.close(fig)
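
# Illustrative sketch (not part of the example above): merging two square matrices
# into one heatmap, the first strictly above and the second on and below the main
# diagonal, with scipy's triu/tril as used by plotMatrix. The toy matrices are
# made up.
import numpy as np
from scipy.sparse import csr_matrix, tril, triu

a = csr_matrix(np.arange(1, 10, dtype=float).reshape(3, 3))
b = csr_matrix(np.arange(10, 19, dtype=float).reshape(3, 3))

upper = triu(a, k=1, format='csr')   # strictly above the diagonal
lower = tril(b, k=0, format='csr')   # diagonal and below

mixed = np.asarray((upper + lower).todense())
print(mixed)
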
Example #26
def main(args=None):

    args = parse_arguments().parse_args(args)
    short_v_long_range = []
    sum_smaller = []
    sum_greater = []
    for matrix in args.matrices:

        is_cooler = check_cooler(matrix)
        if not is_cooler:
            hic_matrix = hm.hiCMatrix(matrix)
        else:
            hic_matrix = matrix
        if args.chromosomes is None:
            # get all chromosomes from cooler file
            if not is_cooler:
                chromosomes_list = list(hic_matrix.chrBinBoundaries)
            else:
                chromosomes_list = cooler.Cooler(matrix).chromnames
        else:
            chromosomes_list = args.chromosomes

        short_v_long_range_matrix_threads = [None] * args.threads
        sum_smaller_threads = [None] * args.threads
        sum_greater_threads = [None] * args.threads

        chromosomesListPerThread = len(chromosomes_list) // args.threads
        all_data_collected = False
        queue = [None] * args.threads
        process = [None] * args.threads
        thread_done = [False] * args.threads
        for i in range(args.threads):

            if i < args.threads - 1:
                chromosomeListThread = chromosomes_list[
                    i * chromosomesListPerThread:(i + 1) *
                    chromosomesListPerThread]
            else:
                chromosomeListThread = chromosomes_list[
                    i * chromosomesListPerThread:]

            queue[i] = Queue()
            process[i] = Process(target=compute_relation_short_long_range,
                                 kwargs=dict(pHiCMatrix=hic_matrix,
                                             pChromosomes=chromosomeListThread,
                                             pDistance=args.distance,
                                             pIsCooler=is_cooler,
                                             pQueue=queue[i]))

            process[i].start()

        while not all_data_collected:
            for i in range(args.threads):
                if queue[i] is not None and not queue[i].empty():
                    short_v_long_range_matrix_threads[i], sum_smaller_threads[
                        i], sum_greater_threads[i] = queue[i].get()
                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True
            all_data_collected = True
            for thread in thread_done:
                if not thread:
                    all_data_collected = False
            time.sleep(1)

        short_v_long_range_matrix = [
            item for sublist in short_v_long_range_matrix_threads
            for item in sublist
        ]
        sum_smaller_matrix = [
            item for sublist in sum_smaller_threads for item in sublist
        ]
        sum_greater_matrix = [
            item for sublist in sum_greater_threads for item in sublist
        ]

        short_v_long_range.append(short_v_long_range_matrix)
        sum_smaller.append(sum_smaller_matrix)
        sum_greater.append(sum_greater_matrix)

    log.debug(short_v_long_range)
    plt.ylabel('Sum short range / long range')
    plt.tick_params(axis='x',
                    which='both',
                    bottom=False,
                    top=False,
                    labelbottom=False)

    box_plot = plt.boxplot(short_v_long_range, patch_artist=True)
    legend_handels_color = []
    for i, patch in enumerate(box_plot['boxes']):
        patch.set_facecolor(args.colorList[i % len(args.colorList)])
        legend_handels_color.append(
            mpatches.Patch(color=args.colorList[i % len(args.colorList)],
                           label=args.matrices[i].split('/')[-1]))
    plt.legend(handles=legend_handels_color)
    plt.savefig(args.plotFileName, dpi=args.dpi)

    if len(args.matrices) > 1:
        p_values = []
        for i, sample in enumerate(short_v_long_range):
            for sample2 in short_v_long_range[i + 1:]:
                statistic, significance_level = ranksums(sample, sample2)
                p_values.append(significance_level)
        log.debug('p_values {}'.format(p_values))
        with open(args.outFileName, 'w') as file:
            header = '# Created with HiCExplorer\'s hicPlotSVL ' + __version__ + '\n'
            header += "# Short range vs long range contacts per chromosome, p-values of each distribution against each other distribution with Wilcoxon rank-sum\n"
            header += '# Short range contacts: <= ' + str(args.distance) + '\n'
            file.write(header)
            counter = 0
            for i, matrix_0 in enumerate(args.matrices):
                for j, matrix_1 in enumerate(args.matrices[i + 1:]):
                    file.write(matrix_0 + '\t' + matrix_1 + '\t' +
                               str(p_values[counter]) + '\n')
                    counter += 1

    with open(args.outFileNameData, 'w') as file:
        header = '# Created with HiCExplorer\'s hicPlotSVL ' + __version__ + '\n'
        header += "# Short range vs long range contacts per chromosome: raw data\n"
        header += '# Short range contacts: <= ' + str(args.distance) + '\n'
        matrices_names = '\t\t\t'.join(args.matrices)
        header += '#\t{}\n'.format(matrices_names)
        header += '# Chromosome\t'
        header += '\t'.join([
            'Ratio', 'Sum <= {}'.format(args.distance), 'Sum > {}'.format(
                args.distance)
        ] * len(args.matrices))
        header += '\n'
        file.write(header)
        counter = 0
        for i, chromosome in enumerate(chromosomes_list):
            file.write('{}\t'.format(chromosome))
            for j, matrix in enumerate(args.matrices):
                if i < len(short_v_long_range[j]):
                    file.write('{}\t{}\t{}\t'.format(short_v_long_range[j][i],
                                                     sum_smaller[j][i],
                                                     sum_greater[j][i]))
                else:
                    file.write('\t')

            file.write('\n')
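
# Illustrative sketch (not part of HiCExplorer): the pairwise Wilcoxon rank-sum
# tests computed above for every pair of matrices, shown on made-up
# per-chromosome ratio distributions.
from scipy.stats import ranksums

distributions = {
    'matrix_a': [1.2, 1.4, 1.1, 1.3, 1.5],
    'matrix_b': [0.9, 1.0, 0.8, 1.1, 0.95],
}

names = list(distributions)
for i, name_a in enumerate(names):
    for name_b in names[i + 1:]:
        statistic, p_value = ranksums(distributions[name_a], distributions[name_b])
        print('{}\t{}\t{}'.format(name_a, name_b, p_value))
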
Example #27
def main(args=None):
    args = parse_arguments().parse_args(args)

    viewpointObj = Viewpoint()
    referencePoints, _ = viewpointObj.readReferencePointFile(
        args.referencePoints)

    relative_positions = set()
    bin_size = 0

    # For each condition (matrix):
    # - compute all viewpoints and smooth them (sliding window approach)
    # - after smoothing, sum all viewpoints up into one profile
    # - compute the percentage of each position with respect to the total interaction count
    # For the models of all conditions:
    # - compute the negative binomial (nbinom) parameters

    referencePointsPerThread = len(referencePoints) // args.threads
    queue = [None] * args.threads
    process = [None] * args.threads
    background_model_data = None
    fail_flag = False
    fail_message = ''

    for matrix in args.matrices:
        hic_ma = hm.hiCMatrix(matrix)
        viewpointObj.hicMatrix = hic_ma

        bin_size = hic_ma.getBinSize()
        all_data_collected = False
        thread_done = [False] * args.threads
        for i in range(args.threads):

            if i < args.threads - 1:
                referencePointsThread = referencePoints[i * referencePointsPerThread:(i + 1) * referencePointsPerThread]
            else:
                referencePointsThread = referencePoints[i * referencePointsPerThread:]

            queue[i] = Queue()
            process[i] = Process(target=compute_background, kwargs=dict(
                pReferencePoints=referencePointsThread,
                pViewpointObj=viewpointObj,
                pArgs=args,
                pQueue=queue[i]
            )
            )

            process[i].start()

        while not all_data_collected:
            for i in range(args.threads):
                if queue[i] is not None and not queue[i].empty():
                    background_data_thread = queue[i].get()
                    if 'Fail:' in background_data_thread:
                        fail_flag = True
                        fail_message = background_data_thread[6:]
                        queue[i] = None
                        process[i].join()
                        process[i].terminate()
                        process[i] = None
                        thread_done[i] = True
                        continue
                    background_model_data_thread, relative_positions_thread = background_data_thread
                    if background_model_data is None:
                        background_model_data = background_model_data_thread
                    else:
                        for relativePosition in background_model_data_thread:
                            if relativePosition in background_model_data:
                                background_model_data[relativePosition].extend(
                                    background_model_data_thread[relativePosition])
                            else:
                                background_model_data[relativePosition] = background_model_data_thread[relativePosition]

                    relative_positions = relative_positions.union(
                        relative_positions_thread)
                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True
            all_data_collected = True
            for thread in thread_done:
                if not thread:
                    all_data_collected = False
            time.sleep(1)

        del hic_ma
        del viewpointObj.hicMatrix

    if fail_flag:
        log.error('An error occurred caused by one or many faulty reference points.')
        log.error('Please run chicQualityControl to remove these from your reference point file: {}'.format(args.referencePoints))
        log.error(fail_message)
        exit(1)
    # for models of all conditions:
    # - fit negative binomial for each relative distance
    relative_positions = sorted(relative_positions)
    nbinom_parameters = {}
    max_value = {}
    mean_value = {}
    sum_all_values = 0
    data_of_distribution = None
    for relative_position in relative_positions:

        if args.truncateZeros:
            data_of_distribution = np.array(background_model_data[relative_position])
            mask = data_of_distribution > 0.0
            data_of_distribution = data_of_distribution[mask]
        else:
            data_of_distribution = np.array(background_model_data[relative_position])
        nbinom_parameters[relative_position] = fit_nbinom.fit(data_of_distribution)

        if len(data_of_distribution) > 0:
            max_value[relative_position] = np.max(data_of_distribution)
            average_value = np.average(data_of_distribution)
            mean_value[relative_position] = average_value
            sum_all_values += average_value
        else:
            max_value[relative_position] = 0.0
            average_value = 0.0
            mean_value[relative_position] = 0.0
            sum_all_values += 0.0

    for relative_position in relative_positions:
        mean_value[relative_position] /= sum_all_values
    # write result to file
    with open(args.outFileName, 'w') as file:
        file.write(
            'Relative position\tsize nbinom\tprob nbinom\tmax value\tmean value\n')

        for relative_position in relative_positions:
            relative_position_in_genomic_scale = relative_position * bin_size
            file.write("{}\t{:.12f}\t{:.12f}\t{:.12f}\t{:.12f}\n".format(relative_position_in_genomic_scale, nbinom_parameters[relative_position]['size'],
                                                                         nbinom_parameters[relative_position]['prob'], max_value[relative_position], mean_value[relative_position]))
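
# Illustrative sketch (not part of HiCExplorer): the per-distance summary done at
# the end of the main() above, reduced to numpy. Zeros are optionally dropped,
# the maximum and mean per relative position are recorded, and the means are then
# normalized so they sum to one across all distances. The toy data are made up.
import numpy as np

background_model_data = {     # relative position -> interaction values
    -1: [0.0, 2.0, 4.0],
    0: [5.0, 7.0],
    1: [1.0, 0.0, 2.0],
}
truncate_zeros = True

max_value, mean_value, sum_all_values = {}, {}, 0.0
for position, values in sorted(background_model_data.items()):
    data = np.array(values)
    if truncate_zeros:
        data = data[data > 0.0]
    max_value[position] = data.max() if len(data) else 0.0
    mean_value[position] = data.mean() if len(data) else 0.0
    sum_all_values += mean_value[position]

for position in mean_value:
    mean_value[position] /= sum_all_values

print(max_value)
print(mean_value)   # the means now sum to 1.0 across relative positions
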
Example #28
def main(args=None):

    args = parse_arguments().parse_args(args)
    hic_matrix_list = []
    sum_list = []
    for matrix in args.matrices:
        hic_ma = hm.hiCMatrix(matrix)
        if args.normalize == 'smallest':
            sum_list.append(hic_ma.matrix.sum())
        hic_matrix_list.append(hic_ma)

    if args.normalize == 'norm_range':
        for i, hic_matrix in enumerate(hic_matrix_list):
            hic_matrix.matrix.data = hic_matrix.matrix.data.astype(np.float32)
            mask = np.isnan(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0

            mask = np.isinf(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0
            min_value = np.min(hic_matrix.matrix.data)
            max_value = np.max(hic_matrix.matrix.data)
            min_max_difference = np.float64(max_value - min_value)

            hic_matrix.matrix.data -= min_value
            hic_matrix.matrix.data /= min_max_difference

            mask = np.isnan(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0

            mask = np.isinf(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0
            hic_matrix.matrix.eliminate_zeros()

            hic_matrix.save(args.outFileName[i], pApplyCorrection=False)
    elif args.normalize == 'smallest':
        argmin = np.argmin(sum_list)

        for i, hic_matrix in enumerate(hic_matrix_list):
            hic_matrix.matrix.data = hic_matrix.matrix.data.astype(np.float32)
            if i != argmin:
                mask = np.isnan(hic_matrix.matrix.data)
                hic_matrix.matrix.data[mask] = 0

                mask = np.isinf(hic_matrix.matrix.data)
                hic_matrix.matrix.data[mask] = 0
                adjust_factor = sum_list[i] / sum_list[argmin]
                hic_matrix.matrix.data /= adjust_factor

            mask = np.isnan(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0

            mask = np.isinf(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0
            hic_matrix.matrix.eliminate_zeros()

            hic_matrix.save(args.outFileName[i], pApplyCorrection=False)
    elif args.normalize == 'multiplicative':

        for i, hic_matrix in enumerate(hic_matrix_list):
            hic_matrix.matrix.data = hic_matrix.matrix.data.astype(np.float32)

            mask = np.isnan(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0

            mask = np.isinf(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0
            hic_matrix.matrix.data *= args.multiplicativeValue

            mask = np.isnan(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0

            mask = np.isinf(hic_matrix.matrix.data)
            hic_matrix.matrix.data[mask] = 0
            hic_matrix.matrix.eliminate_zeros()

            hic_matrix.save(args.outFileName[i], pApplyCorrection=False)
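
# Illustrative sketch (not part of HiCExplorer): the 'norm_range' branch above,
# i.e. a min-max rescaling of the non-zero entries of a sparse matrix to [0, 1],
# with NaN values zeroed out before and after. The toy matrix is made up.
import numpy as np
from scipy.sparse import csr_matrix

mat = csr_matrix(np.array([[0.0, 2.0, 8.0],
                           [2.0, 0.0, np.nan],
                           [8.0, np.nan, 4.0]]))

data = mat.data.astype(np.float32)
data[np.isnan(data)] = 0
data[np.isinf(data)] = 0

min_value = data.min()
span = np.float64(data.max() - min_value)
data = (data - min_value) / span
data[np.isnan(data)] = 0

mat.data = data
mat.eliminate_zeros()
print(mat.toarray())
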
Example #29
def test_maskBins():
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    matrix = np.array([[1, 8, 5, 3, 0], [0, 4, 15, 5, 1], [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1], [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)

    nt.assert_equal(hic.getMatrix(), matrix)
    nt.assert_equal(hic.orig_bin_ids, [])

    new_matrix = np.array([[0, 0, 2], [0, 0, 1], [0, 0, 0]])

    masking_ids = [0, 1]
    hic.maskBins(masking_ids)

    nt.assert_equal(hic.getMatrix(), new_matrix)
    nt.assert_equal(
        sorted(hic.orig_cut_intervals),
        sorted([('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                ('b', 30, 40, 1), ('b', 40, 50, 1)]))
    nt.assert_equal(
        sorted(hic.cut_intervals),
        sorted([('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]))
    nt.assert_equal(hic.chrBinBoundaries,
                    OrderedDict([('a', (0, 1)), ('b', (1, 3))]))
    nt.assert_equal(sorted(hic.orig_bin_ids), sorted([0, 1, 2, 3, 4]))

    # direct return if masking_ids is None or has len() == 0, thus no changes to matrix
    masking_ids = None
    hic.maskBins(masking_ids)

    nt.assert_equal(hic.getMatrix(), new_matrix)
    nt.assert_equal(
        sorted(hic.orig_cut_intervals),
        sorted([('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                ('b', 30, 40, 1), ('b', 40, 50, 1)]))
    nt.assert_equal(
        sorted(hic.cut_intervals),
        sorted([('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]))
    nt.assert_equal(hic.chrBinBoundaries,
                    OrderedDict([('a', (0, 1)), ('b', (1, 3))]))

    masking_ids = []

    hic.maskBins(masking_ids)

    nt.assert_equal(hic.getMatrix(), new_matrix)
    nt.assert_equal(
        sorted(hic.orig_cut_intervals),
        sorted([('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                ('b', 30, 40, 1), ('b', 40, 50, 1)]))
    nt.assert_equal(
        sorted(hic.cut_intervals),
        sorted([('a', 20, 30, 1), ('b', 30, 40, 1), ('b', 40, 50, 1)]))
    nt.assert_equal(hic.chrBinBoundaries,
                    OrderedDict([('a', (0, 1)), ('b', (1, 3))]))

    nt.assert_equal(sorted(hic.orig_bin_ids), sorted([0, 1, 2, 3, 4]))
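
# Illustrative sketch (not part of HiCExplorer): what maskBins does to the matrix
# itself, ignoring the bookkeeping of cut_intervals and orig_bin_ids. Rows and
# columns of the masked bin ids are removed and the remaining bins keep their
# order; the result below matches the 3x3 matrix checked in the test above.
import numpy as np
from scipy.sparse import csr_matrix

matrix = csr_matrix(np.array([[1, 8, 5, 3, 0],
                              [0, 4, 15, 5, 1],
                              [0, 0, 0, 0, 2],
                              [0, 0, 0, 0, 1],
                              [0, 0, 0, 0, 0]]))
masking_ids = [0, 1]

keep = np.setdiff1d(np.arange(matrix.shape[0]), masking_ids)
masked = matrix[keep, :][:, keep]
print(masked.toarray())
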
def main(args=None):
    args = parse_arguments().parse_args(args)

    # read domains file
    domains_df = readDomainBoundaries(args.tadDomains)
    # read full h5 or only region if cooler
    is_cooler_target = check_cooler(args.targetMatrix)
    is_cooler_control = check_cooler(args.controlMatrix)

    if is_cooler_target != is_cooler_control:
        log.error('Matrices are not given in the same format!')
        exit(1)
    if not is_cooler_control:
        hic_matrix_target = hm.hiCMatrix(args.targetMatrix)
        hic_matrix_control = hm.hiCMatrix(args.controlMatrix)
    else:
        hic_matrix_target = args.targetMatrix
        hic_matrix_control = args.controlMatrix
    # accepted_H0 = []
    # rejected_H0 = []
    # log.debug('domains_df {}'.format(domains_df))
    domains = domains_df.values.tolist()

    p_values_threads = [None] * args.threads
    accepted_left_inter_threads = [None] * args.threads
    accepted_right_inter_threads = [None] * args.threads
    accepted_intra_threads = [None] * args.threads
    rows_threads = [None] * args.threads

    domainsPerThread = len(domains) // args.threads
    all_data_collected = False
    queue = [None] * args.threads
    process = [None] * args.threads
    thread_done = [False] * args.threads

    # None --> first thread: process the first element in the list, ignore the last one
    # True --> middle thread: ignore the first and the last element during TAD processing
    # False --> last thread: ignore the first element, process the last one
    thread_id = None
    for i in range(args.threads):

        if i == 0:
            domainListThread = domains[i * domainsPerThread:(
                (i + 1) * domainsPerThread) + 1]
            thread_id = None
        elif i < args.threads - 1:
            domainListThread = domains[(i * domainsPerThread) -
                                       1:((i + 1) * domainsPerThread) + 1]
            thread_id = True

        else:
            domainListThread = domains[(i * domainsPerThread) - 1:]
            thread_id = False

        if args.threads == 1:
            thread_id = ''
        queue[i] = Queue()
        process[i] = Process(target=computeDifferentialTADs,
                             kwargs=dict(pMatrixTarget=hic_matrix_target,
                                         pMatrixControl=hic_matrix_control,
                                         pDomainList=domainListThread,
                                         pCoolOrH5=is_cooler_control,
                                         pPValue=args.pValue,
                                         pThreadId=thread_id,
                                         pQueue=queue[i]))

        process[i].start()

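    # poll the queues until every worker has delivered its results and has been joined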
    while not all_data_collected:
        for i in range(args.threads):
            if queue[i] is not None and not queue[i].empty():
                p_values_threads[i], accepted_left_inter_threads[i], \
                    accepted_right_inter_threads[i], \
                    accepted_intra_threads[i], rows_threads[i] = queue[i].get()

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = all(thread_done)
        time.sleep(1)

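    # flatten the per-thread result lists into single lists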
    p_values_list = [item for sublist in p_values_threads for item in sublist]
    accepted_inter_left = [
        item for sublist in accepted_left_inter_threads for item in sublist
    ]
    accepted_inter_right = [
        item for sublist in accepted_right_inter_threads for item in sublist
    ]
    accepted_intra = [
        item for sublist in accepted_intra_threads for item in sublist
    ]
    rows = [item for sublist in rows_threads for item in sublist]

    p_values_list = np.array(p_values_list)
    # cast the acceptance flags to bool so that '~mask' below is a logical
    # negation rather than a bitwise NOT on integers
    accepted_inter_left = np.array(accepted_inter_left, dtype=bool)
    accepted_inter_right = np.array(accepted_inter_right, dtype=bool)
    accepted_intra = np.array(accepted_intra, dtype=bool)
    rows = np.array(rows)

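    # build the rejection mask according to --mode and --modeReject:
    # 'all' requires the test to be significant in every selected region,
    # otherwise one significant region is sufficient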
    if args.mode == 'intra-TAD':
        mask = accepted_intra
    elif args.mode == 'left-inter-TAD':
        if args.modeReject == 'all':
            mask = np.logical_and(accepted_inter_left, accepted_intra)
        else:
            mask = np.logical_or(accepted_inter_left, accepted_intra)

    elif args.mode == 'right-inter-TAD':
        if args.modeReject == 'all':
            mask = np.logical_and(accepted_intra, accepted_inter_right)
        else:
            mask = np.logical_or(accepted_intra, accepted_inter_right)

    else:
        if args.modeReject == 'all':
            mask = np.logical_and(accepted_inter_left, accepted_inter_right)
            mask = np.logical_and(mask, accepted_intra)
        else:
            mask = np.logical_or(accepted_inter_left, accepted_inter_right)
            mask = np.logical_or(mask, accepted_intra)

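    # split p-values and TAD rows into accepted (H0 kept, non-differential) and
    # rejected (H0 rejected, differential) sets and write one output file for each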
    accepted_H0 = p_values_list[~mask]
    rejected_H0 = p_values_list[mask]
    accepted_rows = rows[~mask]
    rejected_rows = rows[mask]
    with open(args.outFileNamePrefix + '_accepted.diff_tad', 'w') as file:
        header = '# Created with HiCExplorer\'s hicDifferentialTAD version ' + __version__ + '\n'
        header += '# H0 (\'regions are equal\') is accepted for all p-values greater than the given p-value threshold; i.e. regions in this file are not considered differential.\n'
        header += '# Accepted regions, Wilcoxon rank-sum test with p-value threshold: {}, mode: {}, modeReject: {}\n'.format(
            args.pValue, args.mode, args.modeReject)
        header += '# Chromosome\tstart\tend\tname\tscore\tstrand\tp-value left-inter-TAD\tp-value right-inter-TAD\tp-value intra-TAD\n'
        file.write(header)
        for i, row in enumerate(accepted_rows):
            row_list = list(map(str, row))
            file.write('\t'.join(row_list))
            file.write('\t')
            pvalue_list = list(map(str, accepted_H0[i]))
            file.write('\t'.join(pvalue_list))

            file.write('\n')
    with open(args.outFileNamePrefix + '_rejected.diff_tad', 'w') as file:
        header = '# Created with HiCExplorer\'s hicDifferentialTAD version ' + __version__ + '\n'
        header += '# H0 (\'regions are equal\') is rejected for all p-values smaller than or equal to the given p-value threshold; i.e. regions in this file are considered differential.\n'
        header += '# Rejected regions, Wilcoxon rank-sum test with p-value threshold: {}, mode: {}, modeReject: {}\n'.format(
            args.pValue, args.mode, args.modeReject)
        header += '# Chromosome\tstart\tend\tname\tscore\tstrand\tp-value left-inter-TAD\tp-value right-inter-TAD\tp-value intra-TAD\n'

        file.write(header)

        for i, row in enumerate(rejected_rows):
            row_list = list(map(str, row))
            file.write('\t'.join(row_list))
            file.write('\t')
            pvalue_list = list(map(str, rejected_H0[i]))
            file.write('\t'.join(pvalue_list))
            file.write('\n')