コード例 #1
0
ファイル: hicPCA.py プロジェクト: tw7649116/HiCExplorer
def main(args=None):
    """Compute eigenvectors of a Hi-C matrix per chromosome and write them out.

    For every chromosome an observed/expected matrix is built (with
    ``obs_exp_matrix_lieberman`` when ``args.method == 'lieberman'``,
    otherwise with ``obs_exp_matrix_non_zero``), turned into a Pearson
    correlation matrix via ``np.corrcoef`` and decomposed with
    ``linalg.eig``. The first ``args.numberOfEigenvectors`` eigenvector
    columns are collected per bin and written to the files listed in
    ``args.outputFileName`` (one file per eigenvector), either as bedgraph
    or as bigwig depending on ``args.format``.

    Optional outputs / inputs:
      * ``args.pearsonMatrix`` / ``args.obsexpMatrix``: file names to which
        the assembled Pearson and observed/expected matrices are saved.
      * ``args.extraTrack``: a ``.bw``/``.bigwig`` file is handed, per
        chromosome, to ``correlateEigenvectorWithHistonMarkTrack``; any
        other file is handed to ``correlateEigenvectorWithGeneTrack``.

    :param args: argument list forwarded unchanged to
                 ``parse_arguments().parse_args``.

    The process is terminated with exit code 1 when the number of output
    files does not match the number of eigenvectors, when the output format
    is unknown, or when bigwig output is requested but pyBigWig reports no
    numpy support.
    """
    args = parse_arguments().parse_args(args)
    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        log.error("Number of output file names and number of eigenvectors"
                  " does not match. Please "
                  "provide the name of each file.\nFiles: {}\nNumber of "
                  "eigenvectors: {}".format(args.outputFileName,
                                            args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.ignoreMaskedBins:
        new_intervals = enlarge_bins(ma.cut_intervals)
        ma.setCutIntervals(new_intervals)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    # The extra track is treated differently depending on its file extension.
    is_bigwig_track = bool(args.extraTrack) and \
        args.extraTrack.endswith(('.bw', '.bigwig'))

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []

    if args.pearsonMatrix:
        transf_matrix_pearson = lil_matrix(ma.matrix.shape)

    if args.obsexpMatrix:
        transf_matrix_obsexp = lil_matrix(ma.matrix.shape)

    # Total number of bins over all chromosomes; passed to the 'lieberman'
    # observed/expected computation together with the chromosome count.
    chr_names = ma.getChrNames()
    chromosome_count = len(chr_names)
    length_chromosome = 0
    for chrname in chr_names:
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]

    bwTrack = None
    if is_bigwig_track:
        bwTrack = pyBigWig.open(args.extraTrack, 'r')

    k = args.numberOfEigenvectors
    try:
        # PCA is computed per chromosome
        for chrname in chr_names:
            chr_range = ma.getChrBinRange(chrname)
            first_bin, last_bin = chr_range[0], chr_range[1]

            submatrix = ma.matrix[first_bin:last_bin, first_bin:last_bin]
            if args.method == 'lieberman':
                obs_exp_matrix_ = obs_exp_matrix_lieberman(submatrix,
                                                           length_chromosome,
                                                           chromosome_count)
            else:
                obs_exp_matrix_ = obs_exp_matrix_non_zero(submatrix,
                                                          args.ligation_factor)

            obs_exp_matrix_ = csr_matrix(obs_exp_matrix_).todense()
            if args.obsexpMatrix:
                transf_matrix_obsexp[first_bin:last_bin, first_bin:last_bin] = \
                    lil_matrix(obs_exp_matrix_)

            pearson_correlation_matrix = np.corrcoef(obs_exp_matrix_)
            pearson_correlation_matrix = convertNansToZeros(
                csr_matrix(pearson_correlation_matrix)).todense()
            pearson_correlation_matrix = convertInfsToZeros(
                csr_matrix(pearson_correlation_matrix)).todense()

            if args.pearsonMatrix:
                transf_matrix_pearson[first_bin:last_bin, first_bin:last_bin] = \
                    lil_matrix(pearson_correlation_matrix)

            # The eigen decomposition runs on the Pearson matrix itself.
            # (An np.cov() result used to be computed here and immediately
            # overwritten; that discarded computation has been dropped.)
            corrmatrix = pearson_correlation_matrix
            corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
            corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
            _evals, eigs = linalg.eig(corrmatrix)

            chrom, start, end, _ = zip(*ma.cut_intervals[first_bin:last_bin])

            chrom_list += chrom
            start_list += start
            end_list += end
            if is_bigwig_track:
                assert len(end) == len(start)
                correlateEigenvectorWithHistonMarkTrack(eigs[:, :k].transpose(),
                                                        bwTrack, chrname, start,
                                                        end, args.extraTrack,
                                                        args.histonMarkType)

            # one row per bin, holding the first k eigenvector components
            vecs_list += eigs[:, :k].tolist()
    finally:
        # release the bigwig handle even if a chromosome fails
        if bwTrack is not None:
            bwTrack.close()

    if args.pearsonMatrix:
        file_type = 'cool'
        if args.pearsonMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(transf_matrix_pearson.tocsr(),
                                                     ma.cut_intervals,
                                                     ma.nan_bins,
                                                     ma.correction_factors,
                                                     ma.distance_counts)
        matrixFileHandlerOutput.save(args.pearsonMatrix, pSymmetric=True,
                                     pApplyCorrection=False)

    if args.obsexpMatrix:
        file_type = 'cool'
        if args.obsexpMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(transf_matrix_obsexp.tocsr(),
                                                     ma.cut_intervals,
                                                     ma.nan_bins,
                                                     ma.correction_factors,
                                                     ma.distance_counts)
        matrixFileHandlerOutput.save(args.obsexpMatrix, pSymmetric=True,
                                     pApplyCorrection=False)

    if args.extraTrack and not is_bigwig_track:
        vecs_list = correlateEigenvectorWithGeneTrack(ma, vecs_list, args.extraTrack)

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert len(vecs_list) == len(chrom_list)

            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    # rows with an unexpected number of components are skipped
                    if len(value) == args.numberOfEigenvectors:
                        # builtin complex replaces the removed np.complex alias
                        if isinstance(value[idx], complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(toString(chrom_list[i]), start_list[i], end_list[i], value[idx]))

    elif args.format == 'bigwig':
        if pyBigWig.numpy != 1:
            log.error("ERROR: Your version of pyBigWig is not supporting "
                      "numpy: {}".format(pyBigWig.__file__))
            exit(1)

        # bigwig header: one (chromosome, end of its last bin) entry per
        # run of identical chromosome names in chrom_list
        old_chrom = chrom_list[0]
        header = []
        for i, _chrom in enumerate(chrom_list):
            if old_chrom != _chrom:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = _chrom

        header.append((toString(chrom_list[-1]), end_list[-1]))
        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))

            assert len(vecs_list) == len(chrom_list)
            _chrom_list = []
            _start_list = []
            _end_list = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            # set big wig header
            bw.addHeader(header)
            # create entry lists
            for i, value in enumerate(vecs_list):
                # it can happen that some 'value' is having less dimensions than it should
                if len(value) == args.numberOfEigenvectors:
                    # builtin complex replaces the removed np.complex alias
                    if isinstance(value[idx], complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    _chrom_list.append(toString(chrom_list[i]))
                    _start_list.append(start_list[i])
                    _end_list.append(end_list[i])

            # write entries
            bw.addEntries(_chrom_list, _start_list, ends=_end_list,
                          values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)
コード例 #2
0
def main(args=None):
    """Plot a Hi-C matrix as a heatmap and save the figure.

    The matrix named by ``args.matrix`` is loaded, optionally restricted to
    ``args.region`` / ``args.region2`` or reordered by
    ``args.chromosomeOrder``, optionally log-scaled (``args.log`` /
    ``args.log1p``), and drawn either per chromosome (``plotPerChr``) or as
    one heatmap (``plotHeatmap``). The figure is written to
    ``args.outFileName`` with ``args.dpi``.

    :param args: argument list forwarded unchanged to
                 ``parse_arguments().parse_args``.

    The process is terminated with exit code 1 when both
    ``--perChromosome`` and ``--region`` are given.
    """
    args = parse_arguments().parse_args(args)
    if args.title:
        args.title = remove_non_ascii(args.title)

    chrom = None
    start_pos1 = None
    chrom2 = None
    start_pos2 = None

    if args.perChromosome and args.region:
        log.error('ERROR, choose from the option '
                  '--perChromosome or --region, the two '
                  'options at the same time are not '
                  'compatible.')
        exit(1)

    is_cooler = check_cooler(args.matrix)
    log.debug("Cooler or no cooler: {}".format(is_cooler))

    # With more than one entry in chromosomeOrder the matrix is loaded
    # completely and reordered afterwards (second branch below).
    open_cooler_chromosome_order = True
    if args.chromosomeOrder is not None and len(args.chromosomeOrder) > 1:
        open_cooler_chromosome_order = False

    if is_cooler and not args.region2 and open_cooler_chromosome_order:
        log.debug("Retrieve data from cooler format and use its benefits.")
        regionsToRetrieve = None
        if args.region:
            regionsToRetrieve = [args.region]
        if args.chromosomeOrder:
            args.region = None
            args.region2 = None
            regionsToRetrieve = args.chromosomeOrder

        ma = HiCMatrix.hiCMatrix(args.matrix, pChrnameList=regionsToRetrieve)
        log.debug('Shape {}'.format(ma.matrix.shape))
        if args.clearMaskedBins:
            ma.maskBins(ma.nan_bins)
            # to avoid gaps in the plot, bins flanking the masked bins
            # are enlarged
            new_intervals = enlarge_bins(ma.cut_intervals)
            ma.setCutIntervals(new_intervals)

        if args.region:
            chrom, region_start, region_end, idx1, start_pos1, chrom2, region_start2, region_end2, idx2, start_pos2 = getRegion(args, ma)

        matrix = np.asarray(ma.matrix.todense().astype(float))
        matrix_length = len(matrix[0])
        log.debug("Number of data points matrix_cool: {}".format(matrix_length))
    else:
        ma = HiCMatrix.hiCMatrix(args.matrix)
        if args.clearMaskedBins:
            ma.maskBins(ma.nan_bins)
            new_intervals = enlarge_bins(ma.cut_intervals)
            ma.setCutIntervals(new_intervals)
        if args.chromosomeOrder:
            args.region = None
            args.region2 = None

            valid_chromosomes = []
            invalid_chromosomes = []
            log.debug('args.chromosomeOrder: {}'.format(args.chromosomeOrder))
            log.debug("ma.chrBinBoundaries {}".format(ma.chrBinBoundaries))
            if sys.version_info[0] == 3:
                args.chromosomeOrder = toBytes(args.chromosomeOrder)
            # NOTE(review): the loop variable is the function-level `chrom`,
            # so after this loop `chrom` holds the last requested name and is
            # later passed to plotHeatmap as xlabel. Kept as in the original
            # -- confirm whether this is intended.
            for chrom in toString(args.chromosomeOrder):
                if chrom in ma.chrBinBoundaries:
                    valid_chromosomes.append(chrom)
                else:
                    invalid_chromosomes.append(chrom)

            if len(invalid_chromosomes) > 0:
                log.warning("WARNING: The following chromosome/scaffold names were not found. Please check "
                            "the correct spelling of the chromosome names. \n")
                log.warning("\n".join(invalid_chromosomes))
            ma.reorderChromosomes(valid_chromosomes)

        log.info("min: {}, max: {}\n".format(ma.matrix.data.min(), ma.matrix.data.max()))

        if args.region:
            chrom, region_start, region_end, idx1, start_pos1, chrom2, region_start2, region_end2, idx2, start_pos2 = getRegion(args, ma)

            # rows selected by idx1, columns selected by idx2
            matrix = np.asarray(ma.matrix[idx1, :][:, idx2].todense().astype(float))

        else:
            log.debug("Else branch")
            matrix = np.asarray(ma.getMatrix().astype(float))

    matrix_length = len(matrix[0])
    log.debug("Number of data points matrix: {}".format(matrix_length))

    # every row is expected to be as long as the first one
    for matrix_ in matrix:
        if len(matrix_) != matrix_length:
            log.error("Matrices do not have the same length: {} , {}".format(matrix_length, len(matrix_)))

    cmap = cm.get_cmap(args.colorMap)
    log.debug("Nan values set to black\n")
    cmap.set_bad('black')

    bigwig_info = None
    if args.bigwig:
        bigwig_info = {'args': args, 'axis': None, 'axis_colorbar': None, 'nan_bins': ma.nan_bins}

    if args.perChromosome:
        fig = plotPerChr(ma, cmap, args, pBigwig=bigwig_info)

    else:
        norm = None

        if args.log or args.log1p:
            # Replace zeros by the smallest remaining value (presumably
            # because a log scale cannot represent zero).
            mask = matrix == 0
            matrix[mask] = np.nanmin(matrix[~mask])

            if np.isnan(matrix).any() or np.isinf(matrix).any():
                log.debug("any nan {}".format(np.isnan(matrix).any()))
                log.debug("any inf {}".format(np.isinf(matrix).any()))
                # NaN and infinite entries get the minimum of the other entries
                mask_nan = np.isnan(matrix)
                mask_inf = np.isinf(matrix)
                matrix[mask_nan] = np.nanmin(matrix[~mask_nan])
                matrix[mask_inf] = np.nanmin(matrix[~mask_inf])

        log.debug("any nan after remove of nan: {}".format(np.isnan(matrix).any()))
        log.debug("any inf after remove of inf: {}".format(np.isinf(matrix).any()))
        if args.log1p:
            matrix += 1
            norm = LogNorm()
        elif args.log:
            norm = LogNorm()

        if args.bigwig:
            # increase figure height to accommodate bigwig track
            fig_height = 8.5
        else:
            fig_height = 7
        height = 4.8 / fig_height

        fig_width = 8
        width = 5.0 / fig_width
        left_margin = (1.0 - width) * 0.5

        fig = plt.figure(figsize=(fig_width, fig_height), dpi=args.dpi)

        if args.bigwig:
            # 2x2 grid: heatmap (top left), bigwig track (bottom left),
            # colorbar of the bigwig info (top right)
            gs = gridspec.GridSpec(2, 2, height_ratios=[0.90, 0.1], width_ratios=[0.97, 0.03])
            gs.update(hspace=0.05, wspace=0.05)
            ax1 = plt.subplot(gs[0, 0])
            ax2 = plt.subplot(gs[1, 0])
            ax3 = plt.subplot(gs[0, 1])
            bigwig_info['axis'] = ax2
            bigwig_info['axis_colorbar'] = ax3
        else:
            ax1 = None
        bottom = 1.3 / fig_height

        if start_pos1 is None:
            start_pos1 = make_start_pos_array(ma)

        position = [left_margin, bottom, width, height]
        plotHeatmap(matrix, ma.get_chromosome_sizes(), fig, position,
                    args, cmap, xlabel=chrom, ylabel=chrom2,
                    start_pos=start_pos1, start_pos2=start_pos2, pNorm=norm, pAxis=ax1, pBigwig=bigwig_info)

    if not args.disable_tight_layout and (args.perChromosome or args.bigwig):
        try:
            plt.tight_layout()
        except (UserWarning, ValueError):
            # best effort: keep the untightened layout
            log.info("Failed to tight layout. Using regular plot.")

    plt.savefig(args.outFileName, dpi=args.dpi)
    plt.close(fig)
コード例 #3
0
ファイル: hicFindTADs.py プロジェクト: cxlsky/HiCExplorer
def compute_spectra_matrix(args, matrix=None):
    """Compute the TAD-separation score matrix over a range of window sizes.

    The Hi-C matrix is stored in the module-level global ``hic_ma``, its
    diagonal is zeroed, bins listed in ``nan_bins`` are masked, and it is
    converted to a z-score matrix which is saved to
    ``args.outFileName + "_zscore_matrix.h5"``. The bins are then split into
    ``args.numberOfProcessors`` chunks that are processed by
    ``compute_matrix_wrapper`` (in a process pool when more than one
    processor is requested).

    Side effects on ``args``: ``maxDepth``, ``minDepth`` and ``step`` are
    filled with bin-size dependent defaults when they are ``None``, and
    ``binsize`` is set.

    :param args: parsed arguments; reads ``matrix``, ``maxDepth``,
                 ``minDepth``, ``step``, ``outFileName`` and
                 ``numberOfProcessors``.
    :param matrix: already loaded Hi-C matrix object; when ``None`` the
                   matrix is loaded from ``args.matrix``.
    :return: tuple ``(chrom, chr_start, chr_end, matrix)`` where the first
             three are numpy arrays (start/end cast to int) concatenated
             over all chunks and ``matrix`` is the ``np.vstack`` of the
             per-chunk matrices.

    The process exits when the depth/step arguments are inconsistent with
    each other or with the matrix bin size.
    """
    if args.maxDepth is not None and args.minDepth is not None and args.maxDepth <= args.minDepth:
        exit("Please check that maxDepth is larger than minDepth.")

    # kept as a module-level global; the worker function is only handed
    # bin indices and depths, not the matrix itself
    global hic_ma
    if matrix is not None:
        hic_ma = matrix
    else:
        hic_ma = hm.hiCMatrix(args.matrix)

    # remove self counts
    log.info('removing diagonal values\n')
    hic_ma.diagflat(value=0)

    # mask bins without any information
    hic_ma.maskBins(hic_ma.nan_bins)
    orig_intervals = hic_ma.cut_intervals

    binsize = hic_ma.getBinSize()

    if args.maxDepth is None:
        if binsize < 1000:
            args.maxDepth = binsize * 60
        elif 1000 <= binsize < 20000:
            args.maxDepth = binsize * 20
        else:
            args.maxDepth = binsize * 10
    elif args.maxDepth < binsize * 5:
        log.error(
            "Please specify a --maxDepth that is at least 5 times larger than the matrix bin size"
        )
        exit(1)

    if args.minDepth is None:
        if binsize < 1000:
            args.minDepth = binsize * 30
        elif 1000 <= binsize < 20000:
            args.minDepth = binsize * 10
        else:
            args.minDepth = binsize * 5
    elif args.minDepth < binsize * 3:
        log.error(
            "Please specify a --minDepth that is at least 3 times larger than the matrix bin size"
        )
        exit(1)

    if args.step is None:
        if binsize < 1000:
            args.step = binsize * 4
        else:
            args.step = binsize * 2

    elif args.step < binsize:
        log.error(
            "Please specify a --step that is at least the size of the matrix bin size"
        )
        exit(1)

    args.binsize = binsize
    print_args(args)

    # use zscore matrix
    log.info("Computing z-score matrix...\n")
    hic_ma.convert_to_zscore_matrix(maxdepth=args.maxDepth * 2.5, perchr=True)

    # extend remaining bins to remove gaps in
    # the matrix
    new_intervals = enlarge_bins(hic_ma.cut_intervals)

    # rebuild bin positions if necessary
    if new_intervals != orig_intervals:
        hic_ma.interval_trees, hic_ma.chrBinBoundaries = \
            hic_ma.intervalListToIntervalTree(new_intervals)
        hic_ma.cut_intervals = new_intervals
        hic_ma.orig_bin_ids = None
        hic_ma.nan_bins = None

    hic_ma.save(args.outFileName + "_zscore_matrix.h5")

    # re-read the bin size after the intervals may have been rebuilt
    binsize = hic_ma.getBinSize()

    if args.minDepth % binsize != 0:
        log.warning('Warning. specified *depth* is not multiple of the '
                    'hi-c matrix bin size ({})\n'.format(binsize))
    if args.step % binsize != 0:
        log.warning('Warning. specified *step* is not multiple of the '
                    'hi-c matrix bin size ({})\n'.format(binsize))

    log.info("Computing TAD-separation scores...\n")
    min_depth_in_bins = int(args.minDepth / binsize)
    max_depth_in_bins = int(args.maxDepth / binsize)
    step_in_bins = int(args.step / binsize)
    if step_in_bins == 0:
        exit("Please select a step size larger than {}".format(binsize))

    incremental_step = get_incremental_step_size(args.minDepth, args.maxDepth,
                                                 args.step)

    log.info("computing spectrum for window sizes between {} ({} bp) "
             "and {} ({} bp) at the following window sizes {} {}\n".format(
                 min_depth_in_bins, binsize * min_depth_in_bins,
                 max_depth_in_bins, binsize * max_depth_in_bins, step_in_bins,
                 incremental_step))
    if min_depth_in_bins <= 1:
        log.error(
            'ERROR\nminDepth length too small. Use a value that is at least '
            'twice as large as the bin size which is: {}\n'.format(binsize))
        # this is a failure, so report a non-zero exit status
        exit(1)

    if max_depth_in_bins <= 1:
        log.error(
            'ERROR\nmaxDepth length too small. Use a value that is larger '
            'than the bin size which is: {}\n'.format(binsize))
        exit(1)

    # work only with the upper matrix
    # and remove all pixels that are beyond
    # 2 * max_depth_in_bins which are not required
    # (this is done by subtracting a second sparse matrix
    # that contains only the upper part that is to be removed).
    limit = 2 * max_depth_in_bins
    hic_ma.matrix = sparse.triu(hic_ma.matrix, k=0,
                                format='csr') - sparse.triu(
                                    hic_ma.matrix, k=limit, format='csr')
    hic_ma.matrix.eliminate_zeros()

    num_processors = args.numberOfProcessors

    func = compute_matrix_wrapper
    bins_to_consider = []
    for chrom in hic_ma.chrBinBoundaries.keys():
        bins_to_consider.extend(range(*hic_ma.chrBinBoundaries[chrom]))

    # one task per processor, each holding a contiguous chunk of bin indices
    TASKS = [(idx_array, args.minDepth, args.maxDepth, args.step)
             for idx_array in np.array_split(bins_to_consider, num_processors)]

    if num_processors > 1:
        pool = multiprocessing.Pool(num_processors)
        log.info("Using {} processors\n".format(num_processors))
        try:
            res = pool.map_async(func, TASKS).get(9999999)
        finally:
            # all results are collected (or the run failed): release the
            # worker processes instead of leaking them
            pool.terminate()
            pool.join()
    else:
        res = map(func, TASKS)

    chrom = []
    chr_start = []
    chr_end = []
    matrix = []
    for _chrom, _chr_start, _chr_end, _matrix in res:
        chrom.extend(_chrom)
        chr_start.extend(_chr_start)
        chr_end.extend(_chr_end)
        matrix.append(_matrix)

    matrix = np.vstack(matrix)
    return np.array(chrom), np.array(chr_start).astype(int), np.array(
        chr_end).astype(int), matrix