Beispiel #1
0
def _obs_exp_lieberman(pSubmatrix, pLengthChromosome, pChromosomeCount):
    """Return the cleaned output of ``obs_exp_matrix_lieberman`` as a dense matrix.

    The computed matrix is wrapped in a CSR matrix, handed to
    ``convertNansToZeros`` and then to ``convertInfsToZeros``, and the final
    sparse result is converted with ``.todense()``.
    """
    raw = obs_exp_matrix_lieberman(pSubmatrix, pLengthChromosome,
                                   pChromosomeCount)
    without_nans = convertNansToZeros(csr_matrix(raw))
    without_infs = convertInfsToZeros(csr_matrix(without_nans))
    return without_infs.todense()
Beispiel #2
0
def _pearson(pSubmatrix):
    """Return the ``np.corrcoef`` matrix of ``pSubmatrix`` as a dense matrix.

    The correlation coefficients are wrapped in a CSR matrix, passed through
    ``convertNansToZeros`` and ``convertInfsToZeros``, and densified with
    ``.todense()``.
    """
    correlations = np.corrcoef(pSubmatrix)
    sparse_correlations = convertNansToZeros(csr_matrix(correlations))
    sparse_correlations = convertInfsToZeros(csr_matrix(sparse_correlations))
    return sparse_correlations.todense()
Beispiel #3
0
def _pearson(pSubmatrix):
    """Return the ``np.corrcoef`` matrix of ``pSubmatrix``, kept sparse.

    The correlation coefficients are wrapped in a CSR matrix and passed
    through ``convertNansToZeros`` and ``convertInfsToZeros``. Unlike a dense
    variant, the result of the last helper is returned as-is (no
    ``.todense()`` call).
    """
    coefficients = np.corrcoef(pSubmatrix)
    nan_free = convertNansToZeros(csr_matrix(coefficients))
    inf_free = convertInfsToZeros(csr_matrix(nan_free))
    return inf_free
Beispiel #4
0
def __obs_exp(pSubmatrix, pLengthChromosome, pChromosomeCount):
    """Return the cleaned output of ``exp_obs_matrix_lieberman`` as a dense matrix.

    The computed matrix is wrapped in a CSR matrix, passed through
    ``convertNansToZeros`` and ``convertInfsToZeros``, and finally converted
    with ``.todense()``.
    """
    computed = exp_obs_matrix_lieberman(pSubmatrix, pLengthChromosome,
                                        pChromosomeCount)
    cleaned = convertNansToZeros(csr_matrix(computed))
    cleaned = convertInfsToZeros(csr_matrix(cleaned))
    return cleaned.todense()
Beispiel #5
0
def _obs_exp_non_zero(pSubmatrix, ligation_factor):
    """Return the cleaned output of ``obs_exp_matrix_non_zero``, kept sparse.

    ``pSubmatrix`` and ``ligation_factor`` are forwarded unchanged to
    ``obs_exp_matrix_non_zero``. The result is wrapped in a CSR matrix and
    passed through ``convertNansToZeros`` and ``convertInfsToZeros``; no
    ``.todense()`` conversion is applied.
    """
    ratio = obs_exp_matrix_non_zero(pSubmatrix, ligation_factor)
    nan_free = convertNansToZeros(csr_matrix(ratio))
    inf_free = convertInfsToZeros(csr_matrix(nan_free))
    return inf_free
Beispiel #6
0
def _obs_exp(pSubmatrix):
    """Return the cleaned output of ``obs_exp_matrix``, kept sparse.

    The result of ``obs_exp_matrix(pSubmatrix)`` is wrapped in a CSR matrix
    and passed through ``convertNansToZeros`` and ``convertInfsToZeros``. The
    value returned by the last helper is handed back without densifying it.
    """
    ratio = obs_exp_matrix(pSubmatrix)
    nan_free = convertNansToZeros(csr_matrix(ratio))
    return convertInfsToZeros(csr_matrix(nan_free))
Beispiel #7
0
def _obs_exp_lieberman(pSubmatrix, pLengthChromosome, pChromosomeCount):
    """Return the cleaned output of ``obs_exp_matrix_lieberman``, kept sparse.

    The computed matrix is wrapped in a CSR matrix and passed through
    ``convertNansToZeros`` and ``convertInfsToZeros``. The result of the last
    helper is returned directly (no ``.todense()`` call).
    """
    computed = obs_exp_matrix_lieberman(pSubmatrix, pLengthChromosome,
                                        pChromosomeCount)
    nan_free = convertNansToZeros(csr_matrix(computed))
    inf_free = convertInfsToZeros(csr_matrix(nan_free))
    return inf_free
def get_expected_matrix(pSubmatrix):
    """Replace the stored values of ``pSubmatrix`` by expected values; return them densely.

    ``expected_interactions(pSubmatrix)`` supplies a lookup table that is
    indexed, for every non-zero entry, by ``ceil(|row - col| / 2)``.

    Note that ``pSubmatrix.data`` is reassigned, so the caller's matrix object
    is modified as a side effect.

    :param pSubmatrix: sparse matrix exposing ``nonzero()`` and ``data``
    :return: dense matrix obtained after ``convertNansToZeros`` and
             ``convertInfsToZeros``
    """
    expected_by_distance = expected_interactions(pSubmatrix)
    rows, cols = pSubmatrix.nonzero()
    # Table index for each non-zero entry, derived from its diagonal offset.
    lookup = np.ceil(np.absolute(rows - cols) / 2).astype(np.int32)
    pSubmatrix.data = expected_by_distance[lookup]

    cleaned = convertNansToZeros(csr_matrix(pSubmatrix))
    cleaned = convertInfsToZeros(csr_matrix(cleaned))
    return cleaned.todense()
Beispiel #9
0
def _obs_exp(pSubmatrix):
    """Return the cleaned output of ``obs_exp_matrix``, kept sparse.

    ``obs_exp_matrix(pSubmatrix)`` is wrapped in a CSR matrix, then passed to
    ``convertNansToZeros`` followed by ``convertInfsToZeros``. The outcome is
    returned without a ``.todense()`` conversion, even when it holds no data.
    """
    observed_over_expected = obs_exp_matrix(pSubmatrix)
    sanitized = convertNansToZeros(csr_matrix(observed_over_expected))
    sanitized = convertInfsToZeros(csr_matrix(sanitized))
    return sanitized
Beispiel #10
0
def main(args=None):
    """Compute eigenvectors per chromosome and write them as bedgraph or bigwig.

    For every chromosome of the loaded matrix, the intra-chromosomal
    submatrix is turned into an expected/observed matrix
    (``exp_obs_matrix_norm`` when ``args.norm`` is set,
    ``exp_obs_matrix_lieberman`` otherwise), then into a correlation matrix
    (``np.corrcoef``) and its covariance (``np.cov``). The first
    ``args.numberOfEigenvectors`` columns returned by ``linalg.eig`` are
    collected for each bin.

    Optional side outputs: the block-diagonal Pearson matrix
    (``args.pearsonMatrix``) and obs/exp matrix (``args.obsexpMatrix``),
    written as ``h5`` when the file name ends with ``.h5`` and as ``cool``
    otherwise.

    :param args: argument list forwarded to ``parse_arguments().parse_args``
    :return: None. The process exits with status 1 when the number of output
             files does not match the number of eigenvectors, when pyBigWig
             lacks numpy support, or when the output format is unknown.
    """
    args = parse_arguments().parse_args(args)
    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        log.error(
            "Number of output file names and number of eigenvectors does not match. Please "
            "provide the name of each file.\nFiles: {}\nNumber of eigenvectors: {}"
            .format(args.outputFileName, args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []
    # PCA is computed per chromosome
    length_chromosome = 0
    chromosome_count = len(ma.getChrNames())
    if args.pearsonMatrix:
        trasf_matrix_pearson = lil_matrix(ma.matrix.shape)

    if args.obsexpMatrix:
        trasf_matrix_obsexp = lil_matrix(ma.matrix.shape)

    # Total number of bins over all kept chromosomes.
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]

    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)

        submatrix = ma.matrix[chr_range[0]:chr_range[1],
                              chr_range[0]:chr_range[1]]
        if args.norm:
            exp_obs_matrix_ = exp_obs_matrix_norm(submatrix, length_chromosome,
                                                  chromosome_count)
        else:
            exp_obs_matrix_ = exp_obs_matrix_lieberman(submatrix,
                                                       length_chromosome,
                                                       chromosome_count)
        # The NaN/Inf cleanup is identical for both normalisation variants.
        exp_obs_matrix_ = convertNansToZeros(
            csr_matrix(exp_obs_matrix_)).todense()
        exp_obs_matrix_ = convertInfsToZeros(
            csr_matrix(exp_obs_matrix_)).todense()

        if args.obsexpMatrix:
            trasf_matrix_obsexp[chr_range[0]:chr_range[1],
                                chr_range[0]:chr_range[1]] = lil_matrix(
                                    exp_obs_matrix_)

        pearson_correlation_matrix = np.corrcoef(exp_obs_matrix_)
        pearson_correlation_matrix = convertNansToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()
        pearson_correlation_matrix = convertInfsToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()

        if args.pearsonMatrix:
            trasf_matrix_pearson[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = lil_matrix(
                                     pearson_correlation_matrix)

        corrmatrix = np.cov(pearson_correlation_matrix)
        corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
        corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
        evals, eigs = linalg.eig(corrmatrix)
        k = args.numberOfEigenvectors

        chrom, start, end, _ = zip(
            *ma.cut_intervals[chr_range[0]:chr_range[1]])
        # One row per bin, holding the first k eigenvector components.
        vecs_list += eigs[:, :k].tolist()

        chrom_list += chrom
        start_list += start
        end_list += end

    if args.pearsonMatrix:
        file_type = 'cool'
        if args.pearsonMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_pearson.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.pearsonMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.obsexpMatrix:
        file_type = 'cool'
        if args.obsexpMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_obsexp.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.obsexpMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.geneTrack:
        vecs_list = correlateEigenvectorWithGeneTrack(ma, vecs_list,
                                                      args.geneTrack)

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert (len(vecs_list) == len(chrom_list))

            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    if len(value) == args.numberOfEigenvectors:
                        # ``np.complex`` was only an alias of the builtin and
                        # no longer exists in NumPy >= 1.24.
                        if isinstance(value[idx], complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(
                            toString(chrom_list[i]), start_list[i],
                            end_list[i], value[idx]))

    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error(
                "ERROR: Your version of pyBigWig is not supporting numpy: {}".
                format(pyBigWig.__file__))
            exit(1)
        # Header entries: (chromosome name, end of its last bin).
        old_chrom = chrom_list[0]
        header = []
        for i, _chrom in enumerate(chrom_list):
            if old_chrom != _chrom:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = _chrom

        header.append((toString(chrom_list[-1]), end_list[-1]))
        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))

            assert (len(vecs_list) == len(chrom_list))
            _chrom_list = []
            _start_list = []
            _end_list = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            # set big wig header
            bw.addHeader(header)
            # create entry lists
            for i, value in enumerate(vecs_list):
                # it can happen that some 'value' is having less dimensions than it should
                if len(value) == args.numberOfEigenvectors:
                    if isinstance(value[idx], complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    _chrom_list.append(toString(chrom_list[i]))
                    _start_list.append(start_list[i])
                    _end_list.append(end_list[i])

            # write entries
            bw.addEntries(_chrom_list,
                          _start_list,
                          ends=_end_list,
                          values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)
def main(args=None):
    """Correct a Hi-C matrix with ICE or KR balancing, or draw the diagnostic plot.

    Workflow, as implemented below:

    1. Load the matrix (restricted to a single chromosome directly at load
       time when the input passes ``check_cooler`` and exactly one chromosome
       was requested).
    2. Mask bins whose row sum is zero (ICE correction and plot mode only).
    3. In plot mode (``plotName`` present in ``args``) call
       ``plot_total_contact_dist`` and return.
    4. For ICE: mask the bins returned by ``filter_by_zscore``, optionally
       mask low-coverage bins and truncate trans contacts.
    5. Run ``iterative_correction`` (ICE) or ``kr_balancing`` (KR), either on
       the whole matrix or per chromosome (``args.perchr``).
    6. For ICE, optionally mask rows whose sum grew by at least
       ``args.inflationCutoff`` times, then save the matrix.

    :param args: argument list forwarded to ``parse_arguments().parse_args``
    :return: None
    """
    args = parse_arguments().parse_args(args)
    if args.verbose:
        log.setLevel(logging.INFO)

    # args.chromosomes
    if check_cooler(args.matrix) and args.chromosomes is not None and len(
            args.chromosomes) == 1:
        ma = hm.hiCMatrix(args.matrix, pChrnameList=toString(args.chromosomes))
    else:
        ma = hm.hiCMatrix(args.matrix)

        if args.chromosomes:
            ma.reorderChromosomes(toString(args.chromosomes))

    # mask all zero value bins
    if 'correctionMethod' in args:
        if args.correctionMethod == 'ICE':
            row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
            log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
            ma.maskBins(np.flatnonzero(row_sum == 0))
            matrix_shape = ma.matrix.shape
    if 'plotName' in args:
        row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
        log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
        ma.maskBins(np.flatnonzero(row_sum == 0))
        matrix_shape = ma.matrix.shape

    ma.matrix = convertNansToZeros(ma.matrix)
    ma.matrix = convertInfsToZeros(ma.matrix)
    ma.matrix = ma.matrix.astype(np.float64, copy=True)

    log.debug('ma.matrix.indices {}'.format(ma.matrix.indices.dtype))
    log.debug('ma.matrix.data {}'.format(ma.matrix.data.dtype))
    log.debug('ma.matrix.indptr {}'.format(ma.matrix.indptr.dtype))

    if 'plotName' in args:
        plot_total_contact_dist(ma, args)
        log.info("Saving diagnostic plot {}\n".format(args.plotName))
        return

    log.info("matrix contains {} data points. Sparsity {:.3f}.".format(
        len(ma.matrix.data),
        float(len(ma.matrix.data)) / (ma.matrix.shape[0]**2)))

    if args.skipDiagonal:
        ma.diagflat(value=0)

    total_filtered_out = set()
    if args.correctionMethod == 'ICE':
        if not args.filterThreshold:
            log.error('min and max filtering thresholds should be set')
            sys.exit(1)
        outlier_regions = filter_by_zscore(ma,
                                           args.filterThreshold[0],
                                           args.filterThreshold[1],
                                           perchr=args.perchr)
        # compute and print some statistics
        pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
        ma.printchrtoremove(outlier_regions,
                            label="Bins that are MAD outliers ({:.2f}%) "
                            "out of".format(pct_outlier, ma.matrix.shape[0]),
                            restore_masked_bins=False)

        assert matrix_shape == ma.matrix.shape
        # mask filtered regions
        ma.maskBins(outlier_regions)
        total_filtered_out = set(outlier_regions)

        if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1:
            chrom, _, _, coverage = zip(*ma.cut_intervals)

            assert type(coverage[0]) == np.float64

            failed_bins = np.flatnonzero(
                np.array(coverage) < args.sequencedCountCutoff)

            ma.printchrtoremove(failed_bins,
                                label="Bins with low coverage",
                                restore_masked_bins=False)
            ma.maskBins(failed_bins)
            total_filtered_out = set(failed_bins)
            """
            ma.matrix, to_remove = fill_gaps(ma, failed_bins)
            log.warning("From {} failed bins, {} could "
                         "not be filled\n".format(len(failed_bins),
                                                  len(to_remove)))
            ma.maskBins(to_remove)
            """

        if args.transCutoff and 0 < args.transCutoff < 100:
            cutoff = float(args.transCutoff) / 100
            # a usual cutoff is 0.05
            ma.truncTrans(high=cutoff)

        # Row sums before correction. The inflation filter further down reads
        # this whether or not args.transCutoff was given, so it must not live
        # inside the branch above (that caused an UnboundLocalError).
        pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()

    correction_factors = []
    corrected_matrix = lil_matrix(ma.matrix.shape)
    if args.perchr:
        # normalize each chromosome independently
        for chrname in list(ma.interval_trees):
            chr_range = ma.getChrBinRange(chrname)
            chr_submatrix = ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]
            if args.correctionMethod == 'ICE':
                _matrix, _corr_factors = iterative_correction(
                    chr_submatrix, args)
                corrected_matrix[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = _matrix
                correction_factors.append(_corr_factors)
            else:
                # Set the kr matrix along with its correction factors vector
                assert (args.correctionMethod == 'KR')
                log.debug("Loading a float sparse matrix for KR balancing")
                kr = kr_balancing(
                    chr_submatrix.shape[0], chr_submatrix.shape[1],
                    chr_submatrix.count_nonzero(),
                    chr_submatrix.indptr.astype(np.int64, copy=False),
                    chr_submatrix.indices.astype(np.int64, copy=False),
                    chr_submatrix.data.astype(np.float64, copy=False))
                kr.computeKR()
                # The normalised matrix is only fetched for h5 output.
                if args.outFileName.endswith('.h5'):
                    corrected_matrix[
                        chr_range[0]:chr_range[1],
                        chr_range[0]:chr_range[1]] = kr.get_normalised_matrix(
                            True)
                correction_factors.append(
                    kr.get_normalisation_vector(False).todense())

        correction_factors = np.concatenate(correction_factors)

    else:
        if args.correctionMethod == 'ICE':
            corrected_matrix, correction_factors = iterative_correction(
                ma.matrix, args)
            ma.setMatrixValues(corrected_matrix)
        else:
            assert (args.correctionMethod == 'KR')
            log.debug("Loading a float sparse matrix for KR balancing")
            kr = kr_balancing(ma.matrix.shape[0], ma.matrix.shape[1],
                              ma.matrix.count_nonzero(),
                              ma.matrix.indptr.astype(np.int64, copy=False),
                              ma.matrix.indices.astype(np.int64, copy=False),
                              ma.matrix.data.astype(np.float64, copy=False))
            log.debug('passed pointers')
            kr.computeKR()
            log.debug('computation done')

            # set it to False since the vector is already normalised
            # with the previous True
            correction_factors = kr.get_normalisation_vector(False).todense()

            if args.outFileName.endswith('.h5'):
                corrected_matrix = kr.get_normalised_matrix(True)

    if args.outFileName.endswith('.h5'):
        ma.setMatrixValues(corrected_matrix)
    ma.setCorrectionFactors(correction_factors)

    log.debug("Correction factors {}".format(correction_factors[:10]))
    if args.inflationCutoff and args.inflationCutoff > 0 and args.correctionMethod == 'ICE':

        after_row_sum = np.asarray(corrected_matrix.sum(axis=1)).flatten()
        # identify rows that were expanded more than args.inflationCutoff times
        to_remove = np.flatnonzero(
            after_row_sum / pre_row_sum >= args.inflationCutoff)
        ma.printchrtoremove(to_remove,
                            label="inflated >={} "
                            "regions".format(args.inflationCutoff),
                            restore_masked_bins=False)
        total_filtered_out = total_filtered_out.union(to_remove)
        ma.maskBins(to_remove)
    ma.printchrtoremove(sorted(list(total_filtered_out)),
                        label="Total regions to be removed",
                        restore_masked_bins=False)

    ma.save(args.outFileName, pApplyCorrection=False)
def plot_total_contact_dist(hic_ma, args):
    """
    Plots the distribution of number of contacts (excluding self contacts)
    Outliers with a high number are removed for the plot

    One histogram is drawn for the whole matrix, or one per chromosome (laid
    out on a grid with at most five columns) when ``args.perchr`` is set. The
    figure is written to ``args.plotName``.

    :param hic_ma: Hi-C matrix object; its ``matrix`` attribute, ``getChrNames``
                   and ``getChrBinRange`` are used. ``hic_ma.matrix`` is
                   replaced by the output of ``convertNansToZeros`` /
                   ``convertInfsToZeros``.
    :param args: parsed arguments; ``xMax``, ``perchr`` and ``plotName`` are read
    :return: None
    """
    use('Agg')

    majorlocator = MultipleLocator(1)
    majorformatter = FormatStrFormatter('%d')
    minorlocator = MultipleLocator(0.2)

    def plot_histogram(row_sum_values, mad_values, ax1, title=None):
        """Draw a 100-bin histogram of ``row_sum_values`` on ``ax1``.

        A second x axis, scaled via ``mad_values.value_to_mad``, is added on
        top. A vertical line marks the first local minimum of the histogram,
        if there is one, and its translated value is logged.
        """
        if args.xMax:
            ax1.set_xlim(ax1.get_xlim()[0], args.xMax)
            row_sum_values = row_sum_values[row_sum_values < args.xMax]

        ax1.set_xlabel("total counts per bin")
        ax1.set_ylabel("frequency")
        ax1.patch.set_visible(False)
        dist, bin_s, __ = ax1.hist(row_sum_values, 100, color='green')

        # add second axis on top
        ax2 = ax1.twiny()
        ax2.set_xlabel("modified z-score")
        ax2.xaxis.set_major_locator(majorlocator)
        ax2.xaxis.set_major_formatter(majorformatter)
        ax2.xaxis.grid(True, which='minor')
        # for the minor ticks, use no labels; default NullFormatter
        ax2.xaxis.set_minor_locator(minorlocator)

        # update second axis values by mapping the min max
        # of the main axis to the translated values
        # into modified z score.

        # workaround for 'Axis limits cannot be NaN or Inf' bug in version 2.1.1
        log.debug("ax1.get_xlim(): {}".format(ax1.get_xlim()))
        log.debug("np.array(ax1.get_xlim()): {}".format(
            np.array(ax1.get_xlim())))
        log.debug(
            "mad_values.value_to_mad(np.array(ax1.get_xlim())): {}".format(
                mad_values.value_to_mad(np.array(ax1.get_xlim()))))

        ax2.set_xlim(mad_values.value_to_mad(np.array(ax1.get_xlim())))

        # get first local minimum value (interior histogram bins only)
        local_min = [
            x for x, y in enumerate(dist)
            if 1 <= x < len(dist) - 1 and dist[x - 1] > y < dist[x + 1]
        ]

        if len(local_min) > 0:
            threshold = bin_s[local_min[0]]
        else:
            threshold = None

        if threshold:
            mad_threshold = mad_values.value_to_mad(threshold)
            ymin, ymax = ax2.get_ylim()
            ax2.vlines(mad_threshold, ymin, ymax)
            if title:
                log.info("{}: mad threshold {}".format(title, mad_threshold))
            else:
                log.info("mad threshold {}".format(mad_threshold))

    # replace nan by 0
    hic_ma.matrix = convertNansToZeros(hic_ma.matrix)
    hic_ma.matrix = convertInfsToZeros(hic_ma.matrix)

    if args.perchr:
        chroms = hic_ma.getChrNames()
        if len(chroms) > 30:
            # The placeholder was previously never filled in, so a literal
            # "{}" ended up in the log.
            log.warning("The matrix contains {} chromosomes. It is not "
                        "practical to plot each. Try using --chromosomes to "
                        "select some chromosomes or plot a single histogram."
                        .format(len(chroms)))
        num_rows = int(np.ceil(float(len(chroms)) / 5))
        num_cols = min(len(chroms), 5)
        grids = gridspec.GridSpec(num_rows, num_cols)
        fig = plt.figure(figsize=(6 * num_cols, 5 * num_rows))
        ax = {}
        for plot_num, chrname in enumerate(chroms):
            log.info("Plotting chromosome {}".format(chrname))

            chr_range = hic_ma.getChrBinRange(chrname)
            chr_submatrix = hic_ma.matrix[chr_range[0]:chr_range[1],
                                          chr_range[0]:chr_range[1]]

            # Row totals without the diagonal (self contacts).
            row_sum = np.asarray(chr_submatrix.sum(axis=1)).flatten()
            row_sum = row_sum - chr_submatrix.diagonal()
            mad = MAD(row_sum)
            modified_z_score = mad.get_motified_zscores()

            # high remove outliers
            row_sum = row_sum[modified_z_score < 5]

            col = plot_num % num_cols
            row = plot_num // num_cols
            ax[chrname] = fig.add_subplot(grids[row, col])

            plot_histogram(row_sum, mad, ax[chrname], title=chrname)
            ax[chrname].set_title(chrname)
    else:
        fig = plt.figure()
        # Row totals without the diagonal (self contacts).
        row_sum = np.asarray(hic_ma.matrix.sum(axis=1)).flatten()
        row_sum = row_sum - hic_ma.matrix.diagonal()
        mad = MAD(row_sum)
        modified_z_score = mad.get_motified_zscores()

        # high remove outliers
        row_sum = row_sum[modified_z_score < 5]
        ax = fig.add_subplot(111)
        plot_histogram(row_sum, mad, ax)

    plt.tight_layout()
    plt.savefig(args.plotName)
    plt.close()
Beispiel #13
0
def main(args=None):
    """Compute eigenvectors per chromosome and write them as bedgraph or bigwig.

    For every chromosome of the loaded matrix, the intra-chromosomal
    submatrix is turned into an expected/observed matrix
    (``exp_obs_matrix_lieberman``), then into a correlation matrix
    (``np.corrcoef``) and its covariance (``np.cov``). The first
    ``args.numberOfEigenvectors`` columns returned by ``linalg.eig`` are
    collected for each bin and written to one output file per eigenvector.

    :param args: argument list forwarded to ``parse_arguments().parse_args``
    :return: None. The process exits with status 1 when the number of output
             files does not match the number of eigenvectors, when pyBigWig
             lacks numpy support, or when the output format is unknown.
    """
    args = parse_arguments().parse_args(args)
    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        log.error(
            "Number of output file names and number of eigenvectors does not match. Please "
            "provide the name of each file.\nFiles: {}\nNumber of eigenvectors: {}"
            .format(args.outputFileName, args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []
    # PCA is computed per chromosome
    length_chromosome = 0
    chromosome_count = len(ma.getChrNames())
    # Total number of bins over all kept chromosomes.
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        log.debug("Computing pca for chromosome: {}".format(chrname))

        submatrix = ma.matrix[chr_range[0]:chr_range[1],
                              chr_range[0]:chr_range[1]]

        exp_obs_matrix_ = exp_obs_matrix_lieberman(submatrix,
                                                   length_chromosome,
                                                   chromosome_count)
        exp_obs_matrix_ = convertNansToZeros(
            csr_matrix(exp_obs_matrix_)).todense()
        exp_obs_matrix_ = convertInfsToZeros(
            csr_matrix(exp_obs_matrix_)).todense()

        pearson_correlation_matrix = np.corrcoef(exp_obs_matrix_)
        pearson_correlation_matrix = convertNansToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()
        pearson_correlation_matrix = convertInfsToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()
        corrmatrix = np.cov(pearson_correlation_matrix)
        corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
        corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
        evals, eigs = linalg.eig(corrmatrix)
        k = args.numberOfEigenvectors

        chrom, start, end, _ = zip(
            *ma.cut_intervals[chr_range[0]:chr_range[1]])
        # One row per bin, holding the first k eigenvector components.
        vecs_list += eigs[:, :k].tolist()

        chrom_list += chrom
        start_list += start
        end_list += end

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert (len(vecs_list) == len(chrom_list))

            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    if len(value) == args.numberOfEigenvectors:
                        # ``np.complex`` was only an alias of the builtin and
                        # no longer exists in NumPy >= 1.24.
                        if isinstance(value[idx], complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(
                            toString(chrom_list[i]), start_list[i],
                            end_list[i], value[idx]))

    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error(
                "ERROR: Your version of pyBigWig is not supporting numpy: {}".
                format(pyBigWig.__file__))
            exit(1)
        # Header entries: (chromosome name, end of its last bin).
        old_chrom = chrom_list[0]
        header = []
        for i, chrom_ in enumerate(chrom_list):
            if old_chrom != chrom_:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = chrom_

        header.append((toString(chrom_list[-1]), end_list[-1]))
        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))

            assert (len(vecs_list) == len(chrom_list))
            chrom_list_ = []
            start_list_ = []
            end_list_ = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            # set big wig header
            bw.addHeader(header)
            # create entry lists
            for i, value in enumerate(vecs_list):
                # it can happen that some 'value' is having less dimensions than it should
                if len(value) == args.numberOfEigenvectors:
                    if isinstance(value[idx], complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    chrom_list_.append(toString(chrom_list[i]))
                    start_list_.append(start_list[i])
                    end_list_.append(end_list[i])

            # write entries
            bw.addEntries(chrom_list_,
                          start_list_,
                          ends=end_list_,
                          values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)
Beispiel #14
0
def _obs_exp_non_zero(pSubmatrix):
    """Return the cleaned output of ``obs_exp_matrix_non_zero`` as a dense matrix.

    The computed matrix is wrapped in a CSR matrix, passed through
    ``convertNansToZeros`` and ``convertInfsToZeros``, and converted with
    ``.todense()`` before being returned.
    """
    ratio = obs_exp_matrix_non_zero(pSubmatrix)
    nan_free = convertNansToZeros(csr_matrix(ratio))
    inf_free = convertInfsToZeros(csr_matrix(nan_free))
    return inf_free.todense()
Beispiel #15
0
def main(args=None):
    """Run iterative correction on a Hi-C matrix, or draw the diagnostic plot.

    Steps, in the order they are executed (the order matters because ``ma`` is
    masked in place along the way):

    1. Load the matrix and mask bins whose row sum is zero.
    2. If ``plotName`` is present in ``args``: call
       ``plot_total_contact_dist`` and return.
    3. Mask the bins returned by ``filter_by_zscore``; optionally mask bins
       whose coverage is below ``args.sequencedCountCutoff`` and truncate
       trans contacts (``args.transCutoff``, given in percent).
    4. Run ``iterative_correction`` on the whole matrix, or on each
       chromosome block separately when ``args.perchr`` is set.
    5. Optionally mask rows whose sum grew by at least
       ``args.inflationCutoff`` times, then save to ``args.outFileName``.

    :param args: argument list forwarded to ``parse_arguments().parse_args``
    :return: None
    """
    args = parse_arguments().parse_args(args)
    if args.verbose:
        log.setLevel(logging.INFO)

    # args.chromosomes
    # A single requested chromosome is handed to the loader directly when the
    # input passes check_cooler; otherwise chromosomes are reordered afterwards.
    if check_cooler(args.matrix) and args.chromosomes is not None and len(args.chromosomes) == 1:
        ma = hm.hiCMatrix(args.matrix, pChrnameList=toString(args.chromosomes))
    else:
        ma = hm.hiCMatrix(args.matrix)

        if args.chromosomes:
            ma.reorderChromosomes(toString(args.chromosomes))

    # mask all zero value bins
    row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
    ma.maskBins(np.flatnonzero(row_sum == 0))
    # Remembered so the assert below can verify the shape is unchanged.
    matrix_shape = ma.matrix.shape
    ma.matrix = convertNansToZeros(ma.matrix)
    ma.matrix = convertInfsToZeros(ma.matrix)

    # Plot mode: only the diagnostic plot is produced, no correction is run.
    if 'plotName' in args:
        plot_total_contact_dist(ma, args)
        log.info("Saving diagnostic plot {}\n".format(args.plotName))
        return

    log.info("matrix contains {} data points. Sparsity {:.3f}.".format(
        len(ma.matrix.data),
        float(len(ma.matrix.data)) / (ma.matrix.shape[0] ** 2)))

    if args.skipDiagonal:
        ma.diagflat(value=0)

    outlier_regions = filter_by_zscore(ma, args.filterThreshold[0], args.filterThreshold[1], perchr=args.perchr)
    # compute and print some statistics
    pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
    ma.printchrtoremove(outlier_regions, label="Bins that are MAD outliers ({:.2f}%) "
                                               "out of".format(pct_outlier, ma.matrix.shape[0]),
                        restore_masked_bins=False)

    assert matrix_shape == ma.matrix.shape
    # mask filtered regions
    ma.maskBins(outlier_regions)
    total_filtered_out = set(outlier_regions)

    if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1:
        # The fourth field of each cut interval is used as the bin coverage.
        chrom, _, _, coverage = zip(*ma.cut_intervals)

        assert type(coverage[0]) == np.float64

        failed_bins = np.flatnonzero(
            np.array(coverage) < args.sequencedCountCutoff)

        ma.printchrtoremove(failed_bins, label="Bins with low coverage", restore_masked_bins=False)
        ma.maskBins(failed_bins)
        # NOTE(review): this replaces the outlier set built above instead of
        # extending it -- confirm whether a union is intended.
        total_filtered_out = set(failed_bins)
        """
        ma.matrix, to_remove = fill_gaps(ma, failed_bins)
        log.warning("From {} failed bins, {} could "
                         "not be filled\n".format(len(failed_bins),
                                                  len(to_remove)))
        ma.maskBins(to_remove)
        """

    if args.transCutoff and 0 < args.transCutoff < 100:
        # transCutoff is given in percent; truncTrans receives a fraction.
        cutoff = float(args.transCutoff) / 100
        # a usual cutoff is 0.05
        ma.truncTrans(high=cutoff)

    # Row sums before correction, compared against the corrected sums below.
    pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    correction_factors = []
    if args.perchr:
        corrected_matrix = lil_matrix(ma.matrix.shape)
        # normalize each chromosome independently
        for chrname in list(ma.interval_trees):
            chr_range = ma.getChrBinRange(chrname)
            chr_submatrix = ma.matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]]
            _matrix, _corr_factors = iterative_correction(chr_submatrix, args)
            corrected_matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]] = _matrix
            correction_factors.append(_corr_factors)
        correction_factors = np.concatenate(correction_factors)

    else:
        corrected_matrix, correction_factors = iterative_correction(ma.matrix, args)

    ma.setMatrixValues(corrected_matrix)
    ma.setCorrectionFactors(correction_factors)
    log.info("Correction factors {}".format(correction_factors[:10]))
    if args.inflationCutoff and args.inflationCutoff > 0:
        after_row_sum = np.asarray(corrected_matrix.sum(axis=1)).flatten()
        # identify rows that were expanded more than args.inflationCutoff times
        to_remove = np.flatnonzero(after_row_sum / pre_row_sum >= args.inflationCutoff)
        ma.printchrtoremove(to_remove,
                            label="inflated >={} "
                            "regions".format(args.inflationCutoff), restore_masked_bins=False)
        total_filtered_out = total_filtered_out.union(to_remove)

        ma.maskBins(to_remove)

    ma.printchrtoremove(sorted(list(total_filtered_out)),
                        label="Total regions to be removed", restore_masked_bins=False)

    # The corrected values were already set above, hence pApplyCorrection=False.
    ma.save(args.outFileName, pApplyCorrection=False)
def open_and_store_matrix(pMatrixName, pMatricesList, pIndex, pXDimension,
                          pChromosomes, pNorm, pExtraTrack, pHistonMarkType,
                          pBinarization, pQueue):
    """Compute a per-chromosome eigenvector track for each listed matrix.

    Every entry of ``pMatricesList`` is opened as
    ``hm.hiCMatrix(pMatrixName + '::' + entry)``. For each chromosome the
    intra-chromosomal submatrix is turned into an observed/expected matrix,
    then into a Pearson correlation matrix (``np.corrcoef``), then into a
    covariance matrix (``np.cov``); NaN and Inf entries are zeroed after each
    step. The first eigenvector column returned by ``linalg.eig`` is kept.
    The per-chromosome vectors are concatenated into one row of the result.

    Parameters
    ----------
    pMatrixName : str
        Prefix of the matrix location; ``'::' + entry`` is appended to it.
    pMatricesList : iterable of str
        Names of the matrices to process, one result row each.
    pIndex : int
        Row offset; matrix number ``i`` is written to row ``pIndex + i``.
    pXDimension : int
        Number of rows of the result matrix.
    pChromosomes : list or None
        If truthy, passed to ``keepOnlyTheseChr`` to restrict the matrix.
    pNorm : bool
        If truthy use ``obs_exp_matrix_norm``, otherwise
        ``obs_exp_matrix_lieberman``.
    pExtraTrack : str or None
        Path of an extra track. Only used if it ends with ``.bw`` or
        ``.bigwig``; it is then opened with pyBigWig and handed to
        ``correlateEigenvectorWithHistonMarkTrack``.
    pHistonMarkType
        Forwarded unchanged to ``correlateEigenvectorWithHistonMarkTrack``.
    pBinarization : bool
        If truthy, values ``<= 0`` become ``-1`` and values ``> 0`` become
        ``1``.
    pQueue
        Object with a ``put`` method; receives the result matrix (or ``None``
        if ``pMatricesList`` is empty).

    Returns
    -------
    None
        The result is delivered through ``pQueue`` only.
    """
    compartments_matrix = None
    # number of leading eigenvector columns kept per chromosome
    k = 1
    use_bigwig_track = bool(pExtraTrack) and pExtraTrack.endswith(
        ('.bw', '.bigwig'))

    for i, matrix in enumerate(pMatricesList):

        ma = hm.hiCMatrix(pMatrixName + '::' + matrix)

        # WARNING
        # DO NOT APPLY BIN MASKING, WILL LEAD TO DIFFERENT SIZES OF THE CHROMOSOMES
        # THIS IS CAUSING A FAIL OF THE COMPUTATION
        # ma.maskBins(ma.nan_bins)
        if pChromosomes:
            ma.keepOnlyTheseChr(pChromosomes)

        chromosome_names = ma.getChrNames()
        chromosome_count = len(chromosome_names)

        # total number of bins over all chromosomes
        length_chromosome = 0
        for chrname in chromosome_names:
            chr_range = ma.getChrBinRange(chrname)
            length_chromosome += chr_range[1] - chr_range[0]

        bwTrack = pyBigWig.open(pExtraTrack, 'r') if use_bigwig_track else None

        vecs_list = []
        try:
            # PCA is computed per chromosome
            for chrname in chromosome_names:
                chr_range = ma.getChrBinRange(chrname)
                submatrix = ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]
                if pNorm:
                    obs_exp_matrix_ = obs_exp_matrix_norm(submatrix)
                else:
                    obs_exp_matrix_ = obs_exp_matrix_lieberman(
                        submatrix, length_chromosome, chromosome_count)
                obs_exp_matrix_ = convertNansToZeros(
                    csr_matrix(obs_exp_matrix_)).todense()
                obs_exp_matrix_ = convertInfsToZeros(
                    csr_matrix(obs_exp_matrix_)).todense()

                pearson_correlation_matrix = np.corrcoef(obs_exp_matrix_)
                pearson_correlation_matrix = convertNansToZeros(
                    csr_matrix(pearson_correlation_matrix)).todense()
                pearson_correlation_matrix = convertInfsToZeros(
                    csr_matrix(pearson_correlation_matrix)).todense()

                corrmatrix = np.cov(pearson_correlation_matrix)
                corrmatrix = convertNansToZeros(
                    csr_matrix(corrmatrix)).todense()
                corrmatrix = convertInfsToZeros(
                    csr_matrix(corrmatrix)).todense()
                # NOTE(review): eig() does not promise sorted eigenvalues, so
                # column 0 may not be the dominant component - confirm intent.
                _evals, eigs = linalg.eig(corrmatrix)

                if bwTrack is not None:
                    # bin coordinates are only needed for the track lookup
                    _chrom, start, end, _ = zip(
                        *ma.cut_intervals[chr_range[0]:chr_range[1]])
                    correlateEigenvectorWithHistonMarkTrack(
                        eigs[:, :k].transpose(), bwTrack, chrname, start, end,
                        pExtraTrack, pHistonMarkType)

                vecs_list += eigs[:, :k].tolist()
        finally:
            # release the bigWig handle even if a chromosome fails
            if bwTrack is not None:
                bwTrack.close()

        eigenvector = np.real(np.array(vecs_list).flatten())

        if compartments_matrix is None:
            # builtin float replaces the removed np.float alias
            compartments_matrix = np.zeros(
                [pXDimension, len(eigenvector)], dtype=float)

        # zero out NaN and +/-Inf entries in one pass
        eigenvector[~np.isfinite(eigenvector)] = 0

        if pBinarization:
            positive = eigenvector > 0
            eigenvector[~positive] = -1
            eigenvector[positive] = 1

        compartments_matrix[pIndex + i, :] = eigenvector

    pQueue.put(compartments_matrix)