Esempio n. 1
0
def __obs_exp(pSubmatrix, pLengthChromosome, pChromosomeCount):

    exp_obs_matrix_ = exp_obs_matrix_lieberman(pSubmatrix, pLengthChromosome,
                                               pChromosomeCount)
    exp_obs_matrix_ = convertNansToZeros(csr_matrix(exp_obs_matrix_))
    exp_obs_matrix_ = convertInfsToZeros(csr_matrix(exp_obs_matrix_)).todense()
    return exp_obs_matrix_
Esempio n. 2
0
def main(args=None):
    args = parse_arguments().parse_args(args)
    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        log.error(
            "Number of output file names and number of eigenvectors does not match. Please"
            "provide the name of each file.\nFiles: {}\nNumber of eigenvectors: {}"
            .format(args.outputFileName, args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []
    # PCA is computed per chromosome
    length_chromosome = 0
    chromosome_count = len(ma.getChrNames())
    if args.pearsonMatrix:
        trasf_matrix_pearson = lil_matrix(ma.matrix.shape)

    if args.obsexpMatrix:
        trasf_matrix_obsexp = lil_matrix(ma.matrix.shape)

    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)

        submatrix = ma.matrix[chr_range[0]:chr_range[1],
                              chr_range[0]:chr_range[1]]
        if args.norm:
            exp_obs_matrix_ = exp_obs_matrix_norm(submatrix, length_chromosome,
                                                  chromosome_count)
            exp_obs_matrix_ = convertNansToZeros(
                csr_matrix(exp_obs_matrix_)).todense()
            exp_obs_matrix_ = convertInfsToZeros(
                csr_matrix(exp_obs_matrix_)).todense()

        else:
            exp_obs_matrix_ = exp_obs_matrix_lieberman(submatrix,
                                                       length_chromosome,
                                                       chromosome_count)
            exp_obs_matrix_ = convertNansToZeros(
                csr_matrix(exp_obs_matrix_)).todense()
            exp_obs_matrix_ = convertInfsToZeros(
                csr_matrix(exp_obs_matrix_)).todense()

        if args.obsexpMatrix:
            trasf_matrix_obsexp[chr_range[0]:chr_range[1],
                                chr_range[0]:chr_range[1]] = lil_matrix(
                                    exp_obs_matrix_)

        pearson_correlation_matrix = np.corrcoef(exp_obs_matrix_)
        pearson_correlation_matrix = convertNansToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()
        pearson_correlation_matrix = convertInfsToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()

        if args.pearsonMatrix:
            trasf_matrix_pearson[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = lil_matrix(
                                     pearson_correlation_matrix)

        corrmatrix = np.cov(pearson_correlation_matrix)
        corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
        corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
        evals, eigs = linalg.eig(corrmatrix)
        k = args.numberOfEigenvectors

        chrom, start, end, _ = zip(
            *ma.cut_intervals[chr_range[0]:chr_range[1]])
        vecs_list += eigs[:, :k].tolist()

        chrom_list += chrom
        start_list += start
        end_list += end

    if args.pearsonMatrix:
        file_type = 'cool'
        if args.pearsonMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_pearson.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.pearsonMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.obsexpMatrix:
        file_type = 'cool'
        if args.obsexpMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_obsexp.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.obsexpMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.geneTrack:
        vecs_list = correlateEigenvectorWithGeneTrack(ma, vecs_list,
                                                      args.geneTrack)

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert (len(vecs_list) == len(chrom_list))

            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    if len(value) == args.numberOfEigenvectors:
                        if isinstance(value[idx], np.complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(
                            toString(chrom_list[i]), start_list[i],
                            end_list[i], value[idx]))

    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error(
                "ERROR: Your version of pyBigWig is not supporting numpy: {}".
                format(pyBigWig.__file__))
            exit(1)
        old_chrom = chrom_list[0]
        header = []
        for i, _chrom in enumerate(chrom_list):
            if old_chrom != _chrom:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = _chrom

        header.append((toString(chrom_list[-1]), end_list[-1]))
        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))

            assert (len(vecs_list) == len(chrom_list))
            _chrom_list = []
            _start_list = []
            _end_list = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            # set big wig header
            bw.addHeader(header)
            # create entry lists
            for i, value in enumerate(vecs_list):
                # it can happen that some 'value' is having less dimensions than it should
                if len(value) == args.numberOfEigenvectors:
                    if isinstance(value[idx], np.complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    _chrom_list.append(toString(chrom_list[i]))
                    _start_list.append(start_list[i])
                    _end_list.append(end_list[i])

            # write entries
            bw.addEntries(_chrom_list,
                          _start_list,
                          ends=_end_list,
                          values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)
Esempio n. 3
0
def main(args=None):
    args = parse_arguments().parse_args(args)
    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        log.error(
            "Number of output file names and number of eigenvectors does not match. Please"
            "provide the name of each file.\nFiles: {}\nNumber of eigenvectors: {}"
            .format(args.outputFileName, args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []
    # PCA is computed per chromosome
    length_chromosome = 0
    chromosome_count = len(ma.getChrNames())
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        log.debug("Computing pca for chromosome: {}".format(chrname))

        submatrix = ma.matrix[chr_range[0]:chr_range[1],
                              chr_range[0]:chr_range[1]]

        exp_obs_matrix_ = exp_obs_matrix_lieberman(submatrix,
                                                   length_chromosome,
                                                   chromosome_count)
        exp_obs_matrix_ = convertNansToZeros(
            csr_matrix(exp_obs_matrix_)).todense()
        exp_obs_matrix_ = convertInfsToZeros(
            csr_matrix(exp_obs_matrix_)).todense()

        pearson_correlation_matrix = np.corrcoef(exp_obs_matrix_)
        pearson_correlation_matrix = convertNansToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()
        pearson_correlation_matrix = convertInfsToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()
        corrmatrix = np.cov(pearson_correlation_matrix)
        corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
        corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
        evals, eigs = linalg.eig(corrmatrix)
        k = args.numberOfEigenvectors

        chrom, start, end, _ = zip(
            *ma.cut_intervals[chr_range[0]:chr_range[1]])
        vecs_list += eigs[:, :k].tolist()

        chrom_list += chrom
        start_list += start
        end_list += end

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert (len(vecs_list) == len(chrom_list))

            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    if len(value) == args.numberOfEigenvectors:
                        if isinstance(value[idx], np.complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(
                            toString(chrom_list[i]), start_list[i],
                            end_list[i], value[idx]))

    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error(
                "ERROR: Your version of pyBigWig is not supporting numpy: {}".
                format(pyBigWig.__file__))
            exit(1)
        old_chrom = chrom_list[0]
        header = []
        for i, chrom_ in enumerate(chrom_list):
            if old_chrom != chrom_:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = chrom_

        header.append((toString(chrom_list[-1]), end_list[-1]))
        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))

            assert (len(vecs_list) == len(chrom_list))
            chrom_list_ = []
            start_list_ = []
            end_list_ = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            # set big wig header
            bw.addHeader(header)
            # create entry lists
            for i, value in enumerate(vecs_list):
                # it can happen that some 'value' is having less dimensions than it should
                if len(value) == args.numberOfEigenvectors:
                    if isinstance(value[idx], np.complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    chrom_list_.append(toString(chrom_list[i]))
                    start_list_.append(start_list[i])
                    end_list_.append(end_list[i])

            # write entries
            bw.addEntries(chrom_list_,
                          start_list_,
                          ends=end_list_,
                          values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)