def _obs_exp_lieberman(pSubmatrix, pLengthChromosome, pChromosomeCount):
    """Build the Lieberman observed/expected matrix and return it dense.

    All three arguments are forwarded unchanged to
    ``obs_exp_matrix_lieberman``. Its result is wrapped in a CSR matrix and
    passed through ``convertNansToZeros``, then wrapped again and passed
    through ``convertInfsToZeros`` (helpers defined elsewhere; judging by
    their names they zero NaN and infinite entries).

    Returns:
        The cleaned matrix converted with ``.todense()``.

    NOTE(review): another definition with this exact name exists in the
    visible source and returns the sparse matrix instead; if both end up in
    the same module, the later one wins -- confirm which is intended.
    """
    raw_obs_exp = obs_exp_matrix_lieberman(pSubmatrix,
                                           pLengthChromosome,
                                           pChromosomeCount)
    nan_free = convertNansToZeros(csr_matrix(raw_obs_exp))
    inf_free = convertInfsToZeros(csr_matrix(nan_free))
    return inf_free.todense()
def _obs_exp_lieberman(pSubmatrix, pLengthChromosome, pChromosomeCount):
    """Build the Lieberman observed/expected matrix and return it sparse.

    All three arguments are forwarded unchanged to
    ``obs_exp_matrix_lieberman``. Its result is wrapped in a CSR matrix and
    passed through ``convertNansToZeros``, then wrapped again and passed
    through ``convertInfsToZeros`` (helpers defined elsewhere; judging by
    their names they zero NaN and infinite entries).

    Returns:
        Whatever ``convertInfsToZeros`` returns; no ``.todense()`` call is
        applied here, so densifying is left to the caller.
    """
    raw_obs_exp = obs_exp_matrix_lieberman(pSubmatrix,
                                           pLengthChromosome,
                                           pChromosomeCount)
    nan_free = convertNansToZeros(csr_matrix(raw_obs_exp))
    inf_free = convertInfsToZeros(csr_matrix(nan_free))
    return inf_free
def main(args=None):
    """Compute per-chromosome eigenvectors of a Hi-C matrix and write them out.

    Workflow, as implemented below:

    1. Parse the command line and require one output file name per
       requested eigenvector.
    2. Load the matrix, mask its ``nan_bins`` and optionally restrict it to
       ``args.chromosomes``.
    3. For every chromosome: build the observed/expected matrix (method
       ``'lieberman'`` or the non-zero variant), take its Pearson
       correlation, run ``linalg.eig`` on it and keep the first
       ``args.numberOfEigenvectors`` columns of the eigenvector matrix.
    4. Optionally store the obs/exp and Pearson matrices, and optionally
       pass the eigenvectors to the extra-track correlation helpers.
    5. Write one bedgraph or bigwig file per eigenvector.

    Args:
        args: Optional argument list handed to the parser; ``None`` lets the
            parser use its default source.

    The process is terminated via ``exit(1)`` when the number of output
    files does not match the number of eigenvectors, when pyBigWig lacks
    numpy support for bigwig output, or when the output format is unknown.
    """
    args = parse_arguments().parse_args(args)
    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        # A space was missing between "Please" and "provide" in this message.
        log.error("Number of output file names and number of eigenvectors"
                  " does not match. Please "
                  "provide the name of each file.\nFiles: {}\nNumber of "
                  "eigenvectors: {}".format(args.outputFileName,
                                            args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.ignoreMaskedBins:
        new_intervals = enlarge_bins(ma.cut_intervals)
        ma.setCutIntervals(new_intervals)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    # The extra track is either a bigwig file (handled per chromosome inside
    # the loop) or something else (handled once after the loop).
    use_bigwig_track = bool(args.extraTrack) and \
        args.extraTrack.endswith(('.bw', '.bigwig'))

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []

    # PCA is computed per chromosome
    chromosome_count = len(ma.getChrNames())
    if args.pearsonMatrix:
        transf_matrix_pearson = lil_matrix(ma.matrix.shape)
    if args.obsexpMatrix:
        transf_matrix_obsexp = lil_matrix(ma.matrix.shape)

    # Total number of bins over all kept chromosomes.
    length_chromosome = 0
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]

    if use_bigwig_track:
        bwTrack = pyBigWig.open(args.extraTrack, 'r')

    k = args.numberOfEigenvectors
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        lo, hi = chr_range[0], chr_range[1]

        submatrix = ma.matrix[lo:hi, lo:hi]
        if args.method == 'lieberman':
            obs_exp_matrix_ = obs_exp_matrix_lieberman(submatrix,
                                                       length_chromosome,
                                                       chromosome_count)
        else:
            obs_exp_matrix_ = obs_exp_matrix_non_zero(submatrix,
                                                      args.ligation_factor)
        obs_exp_matrix_ = csr_matrix(obs_exp_matrix_).todense()
        if args.obsexpMatrix:
            transf_matrix_obsexp[lo:hi, lo:hi] = lil_matrix(obs_exp_matrix_)

        pearson_correlation_matrix = np.corrcoef(obs_exp_matrix_)
        pearson_correlation_matrix = convertNansToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()
        pearson_correlation_matrix = convertInfsToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()
        if args.pearsonMatrix:
            transf_matrix_pearson[lo:hi, lo:hi] = \
                lil_matrix(pearson_correlation_matrix)

        # The eigen-decomposition runs on the Pearson matrix itself. An
        # np.cov() call used to precede this assignment, but its result was
        # overwritten immediately, so the wasted computation was removed.
        corrmatrix = pearson_correlation_matrix
        corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
        corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
        _evals, eigs = linalg.eig(corrmatrix)

        chrom, start, end, _ = zip(*ma.cut_intervals[lo:hi])
        chrom_list += chrom
        start_list += start
        end_list += end

        if use_bigwig_track:
            assert(len(end) == len(start))
            correlateEigenvectorWithHistonMarkTrack(eigs[:, :k].transpose(),
                                                    bwTrack, chrname,
                                                    start, end,
                                                    args.extraTrack,
                                                    args.histonMarkType)

        vecs_list += eigs[:, :k].tolist()

    if use_bigwig_track:
        # Release the read handle; it is not needed past this point.
        bwTrack.close()

    if args.pearsonMatrix:
        file_type = 'h5' if args.pearsonMatrix.endswith('.h5') else 'cool'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            transf_matrix_pearson.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.pearsonMatrix, pSymmetric=True,
                                     pApplyCorrection=False)

    if args.obsexpMatrix:
        file_type = 'h5' if args.obsexpMatrix.endswith('.h5') else 'cool'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            transf_matrix_obsexp.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.obsexpMatrix, pSymmetric=True,
                                     pApplyCorrection=False)

    if args.extraTrack and not use_bigwig_track:
        vecs_list = correlateEigenvectorWithGeneTrack(ma, vecs_list,
                                                      args.extraTrack)

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert(len(vecs_list) == len(chrom_list))

            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    # rows with an unexpected number of components are skipped
                    if len(value) == args.numberOfEigenvectors:
                        # np.complex was only an alias of the builtin and was
                        # removed in NumPy 1.24; the builtin check also
                        # matches numpy complex scalars.
                        if isinstance(value[idx], complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(
                            toString(chrom_list[i]), start_list[i],
                            end_list[i], value[idx]))

    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error("ERROR: Your version of pyBigWig is not supporting "
                      "numpy: {}".format(pyBigWig.__file__))
            exit(1)

        # One header entry per chromosome: (name, end of its last bin).
        old_chrom = chrom_list[0]
        header = []
        for i, _chrom in enumerate(chrom_list):
            if old_chrom != _chrom:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = _chrom
        header.append((toString(chrom_list[-1]), end_list[-1]))

        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))
            assert(len(vecs_list) == len(chrom_list))

            _chrom_list = []
            _start_list = []
            _end_list = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            # set big wig header
            bw.addHeader(header)
            # create entry lists
            for i, value in enumerate(vecs_list):
                # it can happen that some 'value' is having less dimensions
                # than it should
                if len(value) == args.numberOfEigenvectors:
                    # see the bedgraph branch: builtin complex replaces the
                    # removed np.complex alias
                    if isinstance(value[idx], complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    _chrom_list.append(toString(chrom_list[i]))
                    _start_list.append(start_list[i])
                    _end_list.append(end_list[i])

            # write entries
            bw.addEntries(_chrom_list, _start_list, ends=_end_list,
                          values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)
def open_and_store_matrix(pMatrixName, pMatricesList, pIndex, pXDimension,
                          pChromosomes, pNorm, pExtraTrack, pHistonMarkType,
                          pBinarization, pQueue):
    """Compute the first eigenvector of each matrix and queue the results.

    For every entry of ``pMatricesList`` the matrix
    ``pMatrixName + '::' + entry`` is loaded, optionally restricted to
    ``pChromosomes``, and processed chromosome by chromosome: obs/exp matrix
    -> Pearson correlation -> covariance -> ``linalg.eig``. The first
    eigenvector column (``k = 1``) of every chromosome is concatenated into
    one flat vector per matrix.

    Args:
        pMatrixName: Prefix of the matrix URI passed to ``hm.hiCMatrix``.
        pMatricesList: Names appended to ``pMatrixName`` after ``'::'``.
        pIndex: Row offset; matrix ``i`` of the list is written to row
            ``pIndex + i`` of the result.
        pXDimension: Number of rows allocated for the result matrix.
        pChromosomes: Chromosomes to keep; falsy keeps all of them.
        pNorm: Truthy selects ``obs_exp_matrix_norm``; otherwise
            ``obs_exp_matrix_lieberman`` is used.
        pExtraTrack: Optional track file; only names ending in ``.bw`` or
            ``.bigwig`` are used here.
        pHistonMarkType: Forwarded to
            ``correlateEigenvectorWithHistonMarkTrack``.
        pBinarization: Truthy maps values ``<= 0`` to ``-1`` and values
            ``> 0`` to ``1``.
        pQueue: Object whose ``put`` receives the result matrix.

    Returns:
        None. The result (or ``None`` when ``pMatricesList`` is empty) is
        delivered through ``pQueue.put``.
    """
    compartments_matrix = None
    use_bigwig_track = bool(pExtraTrack) and \
        pExtraTrack.endswith(('.bw', '.bigwig'))

    for i, matrix in enumerate(pMatricesList):
        ma = hm.hiCMatrix(pMatrixName + '::' + matrix)

        # WARNING
        # DO NOT APPLY BIN MASKING, WILL LEAD TO DIFFERENT SIZES OF THE
        # CHROMOSOMES. THIS IS CAUSING A FAIL OF THE COMPUTATION
        # ma.maskBins(ma.nan_bins)

        # only the first eigenvector column is kept per chromosome
        k = 1
        if pChromosomes:
            ma.keepOnlyTheseChr(pChromosomes)

        vecs_list = []

        # PCA is computed per chromosome
        chromosome_count = len(ma.getChrNames())
        # Total number of bins over all kept chromosomes.
        length_chromosome = 0
        for chrname in ma.getChrNames():
            chr_range = ma.getChrBinRange(chrname)
            length_chromosome += chr_range[1] - chr_range[0]

        if use_bigwig_track:
            bwTrack = pyBigWig.open(pExtraTrack, 'r')

        for chrname in ma.getChrNames():
            chr_range = ma.getChrBinRange(chrname)
            lo, hi = chr_range[0], chr_range[1]

            submatrix = ma.matrix[lo:hi, lo:hi]
            if pNorm:
                obs_exp_matrix_ = obs_exp_matrix_norm(submatrix)
            else:
                obs_exp_matrix_ = obs_exp_matrix_lieberman(
                    submatrix, length_chromosome, chromosome_count)
            obs_exp_matrix_ = convertNansToZeros(
                csr_matrix(obs_exp_matrix_)).todense()
            obs_exp_matrix_ = convertInfsToZeros(
                csr_matrix(obs_exp_matrix_)).todense()

            pearson_correlation_matrix = np.corrcoef(obs_exp_matrix_)
            pearson_correlation_matrix = convertNansToZeros(
                csr_matrix(pearson_correlation_matrix)).todense()
            pearson_correlation_matrix = convertInfsToZeros(
                csr_matrix(pearson_correlation_matrix)).todense()

            corrmatrix = np.cov(pearson_correlation_matrix)
            corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
            corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
            _evals, eigs = linalg.eig(corrmatrix)

            # Only start/end are needed (for the track correlation); the
            # previously accumulated chrom/start/end lists were never read.
            _chrom, start, end, _ = zip(*ma.cut_intervals[lo:hi])

            if use_bigwig_track:
                assert (len(end) == len(start))
                correlateEigenvectorWithHistonMarkTrack(
                    eigs[:, :k].transpose(), bwTrack, chrname, start, end,
                    pExtraTrack, pHistonMarkType)

            vecs_list += eigs[:, :k].tolist()

        if use_bigwig_track:
            # Release the handle opened for this matrix.
            bwTrack.close()

        flat_vectors = np.array(vecs_list).flatten()
        if compartments_matrix is None:
            # Column count is fixed by the first processed matrix.
            # np.float was removed in NumPy 1.24; the builtin float is the
            # type that alias referred to.
            compartments_matrix = np.zeros(
                [pXDimension, len(flat_vectors)], dtype=float)

        eigenvector = np.real(flat_vectors)
        # zero out NaN and +/-inf entries in one step
        eigenvector[~np.isfinite(eigenvector)] = 0

        if pBinarization:
            positive = eigenvector > 0
            eigenvector[~positive] = -1
            eigenvector[positive] = 1

        compartments_matrix[pIndex + i, :] = eigenvector

    pQueue.put(compartments_matrix)
    return