def main(args=None):
    args = parse_arguments().parse_args(args)

    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        log.error("The number of output file names and the number of eigenvectors "
                  "do not match. Please provide one file name per eigenvector.\n"
                  "Files: {}\nNumber of eigenvectors: {}".format(args.outputFileName,
                                                                 args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.ignoreMaskedBins:
        new_intervals = enlarge_bins(ma.cut_intervals)
        ma.setCutIntervals(new_intervals)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []

    # PCA is computed per chromosome
    length_chromosome = 0
    chromosome_count = len(ma.getChrNames())

    if args.pearsonMatrix:
        transf_matrix_pearson = lil_matrix(ma.matrix.shape)

    if args.obsexpMatrix:
        transf_matrix_obsexp = lil_matrix(ma.matrix.shape)

    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]

    if args.extraTrack and (args.extraTrack.endswith('.bw') or args.extraTrack.endswith('.bigwig')):
        bwTrack = pyBigWig.open(args.extraTrack, 'r')

    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)

        submatrix = ma.matrix[chr_range[0]:chr_range[1],
                              chr_range[0]:chr_range[1]]

        # observed / expected transformation
        if args.method == 'lieberman':
            obs_exp_matrix_ = obs_exp_matrix_lieberman(submatrix, length_chromosome, chromosome_count)
        else:
            obs_exp_matrix_ = obs_exp_matrix_non_zero(submatrix, args.ligation_factor)

        obs_exp_matrix_ = csr_matrix(obs_exp_matrix_).todense()
        if args.obsexpMatrix:
            transf_matrix_obsexp[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = lil_matrix(obs_exp_matrix_)

        # Pearson correlation of the observed/expected matrix
        pearson_correlation_matrix = np.corrcoef(obs_exp_matrix_)
        pearson_correlation_matrix = convertNansToZeros(csr_matrix(pearson_correlation_matrix)).todense()
        pearson_correlation_matrix = convertInfsToZeros(csr_matrix(pearson_correlation_matrix)).todense()

        if args.pearsonMatrix:
            transf_matrix_pearson[chr_range[0]:chr_range[1],
                                  chr_range[0]:chr_range[1]] = lil_matrix(pearson_correlation_matrix)

        corrmatrix = pearson_correlation_matrix
        corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
        corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()

        # eigendecomposition of the correlation matrix
        evals, eigs = linalg.eig(corrmatrix)
        k = args.numberOfEigenvectors

        chrom, start, end, _ = zip(*ma.cut_intervals[chr_range[0]:chr_range[1]])

        chrom_list += chrom
        start_list += start
        end_list += end
        if args.extraTrack and (args.extraTrack.endswith('.bw') or args.extraTrack.endswith('.bigwig')):
            assert len(end) == len(start)
            correlateEigenvectorWithHistonMarkTrack(eigs[:, :k].transpose(), bwTrack, chrname, start,
                                                    end, args.extraTrack, args.histonMarkType)

        vecs_list += eigs[:, :k].tolist()

    if args.pearsonMatrix:
        file_type = 'cool'
        if args.pearsonMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(transf_matrix_pearson.tocsr(), ma.cut_intervals, ma.nan_bins,
                                                     ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.pearsonMatrix, pSymmetric=True, pApplyCorrection=False)

    if args.obsexpMatrix:
        file_type = 'cool'
        if args.obsexpMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(transf_matrix_obsexp.tocsr(), ma.cut_intervals, ma.nan_bins,
                                                     ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.obsexpMatrix, pSymmetric=True, pApplyCorrection=False)

    if args.extraTrack and not args.extraTrack.endswith('.bw') and not args.extraTrack.endswith('.bigwig'):
        vecs_list = correlateEigenvectorWithGeneTrack(ma, vecs_list, args.extraTrack)

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert len(vecs_list) == len(chrom_list)

            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    if len(value) == args.numberOfEigenvectors:
                        if isinstance(value[idx], complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(toString(chrom_list[i]), start_list[i],
                                                                end_list[i], value[idx]))

    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error("ERROR: Your version of pyBigWig does not support "
                      "numpy: {}".format(pyBigWig.__file__))
            exit(1)

        old_chrom = chrom_list[0]
        header = []
        for i, _chrom in enumerate(chrom_list):
            if old_chrom != _chrom:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = _chrom

        header.append((toString(chrom_list[-1]), end_list[-1]))

        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))

            assert len(vecs_list) == len(chrom_list)
            _chrom_list = []
            _start_list = []
            _end_list = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            # set the bigwig header
            bw.addHeader(header)

            # create entry lists
            for i, value in enumerate(vecs_list):
                # it can happen that some 'value' has fewer dimensions than expected
                if len(value) == args.numberOfEigenvectors:
                    if isinstance(value[idx], complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    _chrom_list.append(toString(chrom_list[i]))
                    _start_list.append(start_list[i])
                    _end_list.append(end_list[i])

            # write entries
            bw.addEntries(_chrom_list, _start_list, ends=_end_list, values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)
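
# Illustrative sketch (not part of hicPCA): the per-chromosome core of the
# computation above, reduced to plain NumPy/SciPy on a small dense contact
# matrix. The observed/expected step here is a simple per-diagonal average,
# standing in for obs_exp_matrix_lieberman / obs_exp_matrix_non_zero; the
# helper name toy_compartment_eigenvectors is invented for this example.
import numpy as np
from scipy import linalg


def toy_compartment_eigenvectors(contacts, k=2):
    """Return the first k eigenvectors of the Pearson-correlated obs/exp matrix."""
    n = contacts.shape[0]
    expected = np.ones_like(contacts, dtype=float)
    for d in range(n):
        diag_mean = np.mean(np.diagonal(contacts, offset=d))
        if diag_mean > 0:
            idx = np.arange(n - d)
            expected[idx, idx + d] = diag_mean
            expected[idx + d, idx] = diag_mean
    obs_exp = contacts / expected
    pearson = np.corrcoef(obs_exp)
    pearson = np.nan_to_num(pearson)   # mirrors convertNansToZeros / convertInfsToZeros
    evals, eigs = linalg.eig(pearson)  # same call hicPCA uses per chromosome
    return np.real(eigs[:, :k])        # the bedgraph/bigwig output keeps only the real part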
def main(args=None):
    args = parse_arguments().parse_args(args)

    if args.title:
        args.title = remove_non_ascii(args.title)

    chrom = None
    start_pos1 = None
    chrom2 = None
    start_pos2 = None

    if args.perChromosome and args.region:
        log.error('ERROR: --perChromosome and --region are mutually exclusive. '
                  'Please use only one of the two options.')
        exit(1)

    is_cooler = check_cooler(args.matrix)
    log.debug("Cooler or no cooler: {}".format(is_cooler))

    open_cooler_chromosome_order = True
    if args.chromosomeOrder is not None and len(args.chromosomeOrder) > 1:
        open_cooler_chromosome_order = False

    if is_cooler and not args.region2 and open_cooler_chromosome_order:
        log.debug("Retrieve data from cooler format and use its benefits.")
        regionsToRetrieve = None
        if args.region:
            regionsToRetrieve = []
            regionsToRetrieve.append(args.region)

        if args.chromosomeOrder:
            args.region = None
            args.region2 = None
            regionsToRetrieve = args.chromosomeOrder

        ma = HiCMatrix.hiCMatrix(args.matrix, pChrnameList=regionsToRetrieve)
        log.debug('Shape {}'.format(ma.matrix.shape))
        if args.clearMaskedBins:
            ma.maskBins(ma.nan_bins)
            # to avoid gaps in the plot, bins flanking the masked bins
            # are enlarged
            new_intervals = enlarge_bins(ma.cut_intervals)
            ma.setCutIntervals(new_intervals)
        if args.region:
            chrom, region_start, region_end, idx1, start_pos1, chrom2, region_start2, region_end2, idx2, start_pos2 = getRegion(args, ma)

        matrix = np.asarray(ma.matrix.todense().astype(float))
        matrix_length = len(matrix[0])
        log.debug("Number of data points matrix_cool: {}".format(matrix_length))
    else:
        ma = HiCMatrix.hiCMatrix(args.matrix)
        if args.clearMaskedBins:
            ma.maskBins(ma.nan_bins)
            new_intervals = enlarge_bins(ma.cut_intervals)
            ma.setCutIntervals(new_intervals)

        if args.chromosomeOrder:
            args.region = None
            args.region2 = None

            valid_chromosomes = []
            invalid_chromosomes = []
            log.debug('args.chromosomeOrder: {}'.format(args.chromosomeOrder))
            log.debug("ma.chrBinBoundaries {}".format(ma.chrBinBoundaries))
            if sys.version_info[0] == 3:
                args.chromosomeOrder = toBytes(args.chromosomeOrder)

            for chrom in toString(args.chromosomeOrder):
                if chrom in ma.chrBinBoundaries:
                    valid_chromosomes.append(chrom)
                else:
                    invalid_chromosomes.append(chrom)

            if len(invalid_chromosomes) > 0:
                log.warning("WARNING: The following chromosome/scaffold names were not found. "
                            "Please check the spelling of the chromosome names.\n")
                log.warning("\n".join(invalid_chromosomes))

            ma.reorderChromosomes(valid_chromosomes)

        log.info("min: {}, max: {}\n".format(ma.matrix.data.min(), ma.matrix.data.max()))

        if args.region:
            chrom, region_start, region_end, idx1, start_pos1, chrom2, region_start2, region_end2, idx2, start_pos2 = getRegion(args, ma)
            matrix = np.asarray(ma.matrix[idx1, :][:, idx2].todense().astype(float))
        else:
            log.debug("Else branch")
            matrix = np.asarray(ma.getMatrix().astype(float))

        matrix_length = len(matrix[0])
        log.debug("Number of data points matrix: {}".format(matrix_length))

        for matrix_ in matrix:
            if not matrix_length == len(matrix_):
                log.error("Matrices do not have the same length: {} , {}".format(matrix_length, len(matrix_)))

    cmap = cm.get_cmap(args.colorMap)
    log.debug("NaN values set to black\n")
    cmap.set_bad('black')

    bigwig_info = None
    if args.bigwig:
        bigwig_info = {'args': args, 'axis': None, 'axis_colorbar': None, 'nan_bins': ma.nan_bins}

    if args.perChromosome:
        fig = plotPerChr(ma, cmap, args, pBigwig=bigwig_info)
    else:
        norm = None

        if args.log or args.log1p:
            mask = matrix == 0
            matrix[mask] = np.nanmin(matrix[mask == False])

            if np.isnan(matrix).any() or np.isinf(matrix).any():
                log.debug("any nan {}".format(np.isnan(matrix).any()))
                log.debug("any inf {}".format(np.isinf(matrix).any()))
                mask_nan = np.isnan(matrix)
                mask_inf = np.isinf(matrix)
                matrix[mask_nan] = np.nanmin(matrix[mask_nan == False])
                matrix[mask_inf] = np.nanmin(matrix[mask_inf == False])
                log.debug("any nan after remove of nan: {}".format(np.isnan(matrix).any()))
                log.debug("any inf after remove of inf: {}".format(np.isinf(matrix).any()))

        if args.log1p:
            matrix += 1
            norm = LogNorm()
        elif args.log:
            norm = LogNorm()

        if args.bigwig:
            # increase figure height to accommodate the bigwig track
            fig_height = 8.5
        else:
            fig_height = 7
        height = 4.8 / fig_height

        fig_width = 8
        width = 5.0 / fig_width
        left_margin = (1.0 - width) * 0.5

        fig = plt.figure(figsize=(fig_width, fig_height), dpi=args.dpi)

        if args.bigwig:
            gs = gridspec.GridSpec(2, 2, height_ratios=[0.90, 0.1], width_ratios=[0.97, 0.03])
            gs.update(hspace=0.05, wspace=0.05)
            ax1 = plt.subplot(gs[0, 0])
            ax2 = plt.subplot(gs[1, 0])
            ax3 = plt.subplot(gs[0, 1])
            bigwig_info['axis'] = ax2
            bigwig_info['axis_colorbar'] = ax3
        else:
            ax1 = None
        bottom = 1.3 / fig_height

        if start_pos1 is None:
            start_pos1 = make_start_pos_array(ma)

        position = [left_margin, bottom, width, height]
        plotHeatmap(matrix, ma.get_chromosome_sizes(), fig, position, args, cmap,
                    xlabel=chrom, ylabel=chrom2, start_pos=start_pos1, start_pos2=start_pos2,
                    pNorm=norm, pAxis=ax1, pBigwig=bigwig_info)

    if not args.disable_tight_layout:
        if args.perChromosome or args.bigwig:
            try:
                plt.tight_layout()
            except (UserWarning, ValueError):
                log.info("Failed to tight layout. Using regular plot.")

    plt.savefig(args.outFileName, dpi=args.dpi)
    plt.close(fig)
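
# Illustrative sketch (not part of hicPlotMatrix): the colour-mapping choices made
# above, shown on their own with plain matplotlib. cmap.set_bad('black') paints NaN
# bins black, and --log1p corresponds to adding 1 before applying a LogNorm so that
# zero-count bins do not break the log scale. The function name toy_plot_heatmap is
# invented for this example.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import LogNorm


def toy_plot_heatmap(matrix, outfile, colormap='RdYlBu_r', dpi=72):
    cmap = cm.get_cmap(colormap)
    cmap.set_bad('black')  # NaN bins are drawn in black
    fig, ax = plt.subplots(figsize=(8, 7), dpi=dpi)
    img = ax.imshow(matrix + 1, cmap=cmap, norm=LogNorm(), interpolation='nearest')
    fig.colorbar(img, ax=ax)
    fig.savefig(outfile, dpi=dpi)
    plt.close(fig)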
def compute_spectra_matrix(args, matrix=None):
    if args.maxDepth is not None and args.minDepth is not None and args.maxDepth <= args.minDepth:
        exit("Please check that maxDepth is larger than minDepth.")
    global hic_ma
    if matrix is not None:
        hic_ma = matrix
    else:
        hic_ma = hm.hiCMatrix(args.matrix)

    # remove self counts
    log.info('removing diagonal values\n')
    hic_ma.diagflat(value=0)

    # mask bins without any information
    hic_ma.maskBins(hic_ma.nan_bins)
    orig_intervals = hic_ma.cut_intervals

    binsize = hic_ma.getBinSize()

    if args.maxDepth is None:
        if binsize < 1000:
            args.maxDepth = binsize * 60
        elif 1000 <= binsize < 20000:
            args.maxDepth = binsize * 20
        else:
            args.maxDepth = binsize * 10
    elif args.maxDepth < binsize * 5:
        log.error("Please specify a --maxDepth that is at least 5 times larger than the matrix bin size")
        exit(1)

    if args.minDepth is None:
        if binsize < 1000:
            args.minDepth = binsize * 30
        elif 1000 <= binsize < 20000:
            args.minDepth = binsize * 10
        else:
            args.minDepth = binsize * 5
    elif args.minDepth < binsize * 3:
        log.error("Please specify a --minDepth that is at least 3 times larger than the matrix bin size")
        exit(1)

    if args.step is None:
        if binsize < 1000:
            args.step = binsize * 4
        else:
            args.step = binsize * 2
    elif args.step < binsize:
        log.error("Please specify a --step that is at least the size of the matrix bin size")
        exit(1)

    args.binsize = binsize
    print_args(args)

    # use z-score matrix
    log.info("Computing z-score matrix...\n")
    hic_ma.convert_to_zscore_matrix(maxdepth=args.maxDepth * 2.5, perchr=True)

    # extend remaining bins to remove gaps in the matrix
    new_intervals = enlarge_bins(hic_ma.cut_intervals)

    # rebuild bin positions if necessary
    if new_intervals != orig_intervals:
        hic_ma.interval_trees, hic_ma.chrBinBoundaries = \
            hic_ma.intervalListToIntervalTree(new_intervals)
        hic_ma.cut_intervals = new_intervals
        hic_ma.orig_bin_ids = None
        hic_ma.nan_bins = None

    hic_ma.save(args.outFileName + "_zscore_matrix.h5")

    if args.minDepth % hic_ma.getBinSize() != 0:
        log.warning('Warning. The specified *minDepth* is not a multiple of the '
                    'Hi-C matrix bin size ({})\n'.format(hic_ma.getBinSize()))
    if args.step % hic_ma.getBinSize() != 0:
        log.warning('Warning. The specified *step* is not a multiple of the '
                    'Hi-C matrix bin size ({})\n'.format(hic_ma.getBinSize()))

    binsize = hic_ma.getBinSize()

    log.info("Computing TAD-separation scores...\n")
    min_depth_in_bins = int(args.minDepth / binsize)
    max_depth_in_bins = int(args.maxDepth / binsize)
    step_in_bins = int(args.step / binsize)
    if step_in_bins == 0:
        exit("Please select a step size larger than {}".format(binsize))

    incremental_step = get_incremental_step_size(args.minDepth, args.maxDepth, args.step)
    log.info("computing spectrum for window sizes between {} ({} bp) "
             "and {} ({} bp) at the following window sizes {} {}\n".format(min_depth_in_bins,
                                                                           binsize * min_depth_in_bins,
                                                                           max_depth_in_bins,
                                                                           binsize * max_depth_in_bins,
                                                                           step_in_bins, incremental_step))
    if min_depth_in_bins <= 1:
        log.error('ERROR\nminDepth length too small. Use a value that is at least '
                  'twice as large as the bin size which is: {}\n'.format(binsize))
        exit(0)

    if max_depth_in_bins <= 1:
        log.error('ERROR\nmaxDepth length too small. Use a value that is larger '
                  'than the bin size which is: {}\n'.format(binsize))
        exit(0)

    # work only with the upper triangle of the matrix and remove all pixels
    # beyond 2 * max_depth_in_bins, which are not required
    # (this is done by subtracting a second sparse matrix that contains only
    # the part of the upper triangle that should be removed).
    limit = 2 * max_depth_in_bins
    hic_ma.matrix = sparse.triu(hic_ma.matrix, k=0, format='csr') - sparse.triu(hic_ma.matrix, k=limit, format='csr')
    hic_ma.matrix.eliminate_zeros()

    num_processors = args.numberOfProcessors
    func = compute_matrix_wrapper
    TASKS = []
    bins_to_consider = []
    for chrom in hic_ma.chrBinBoundaries.keys():
        bins_to_consider.extend(range(*hic_ma.chrBinBoundaries[chrom]))

    for idx_array in np.array_split(bins_to_consider, num_processors):
        TASKS.append((idx_array, args.minDepth, args.maxDepth, args.step))

    if num_processors > 1:
        pool = multiprocessing.Pool(num_processors)
        log.info("Using {} processors\n".format(num_processors))
        res = pool.map_async(func, TASKS).get(9999999)
    else:
        res = map(func, TASKS)

    chrom = []
    chr_start = []
    chr_end = []
    matrix = []
    for _chrom, _chr_start, _chr_end, _matrix in res:
        chrom.extend(_chrom)
        chr_start.extend(_chr_start)
        chr_end.extend(_chr_end)
        matrix.append(_matrix)

    matrix = np.vstack(matrix)
    return np.array(chrom), np.array(chr_start).astype(int), np.array(chr_end).astype(int), matrix
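
# Illustrative sketch (not part of hicFindTADs): the band-limiting trick used above
# to drop contacts farther than 2 * maxDepth from the diagonal. Keeping the upper
# triangle (k=0) and subtracting the upper triangle that starts at the `limit`-th
# diagonal (k=limit) leaves only the diagonal band that the TAD-separation score
# actually needs. The helper name keep_upper_band is invented for this example.
import numpy as np
from scipy import sparse


def keep_upper_band(matrix, limit):
    """Return the upper-triangular part of `matrix` restricted to diagonals 0..limit-1."""
    banded = sparse.triu(matrix, k=0, format='csr') - sparse.triu(matrix, k=limit, format='csr')
    banded.eliminate_zeros()
    return banded


# Example: with limit=2 only the main diagonal and the first off-diagonal survive.
dense = np.arange(1, 17, dtype=float).reshape(4, 4)
print(keep_upper_band(sparse.csr_matrix(dense), limit=2).toarray())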