def main():
    """Print or save summary statistics for every Hi-C matrix on the command line.

    For each matrix in ``args.matrices`` the report lists the file name, the
    number of bins, the matrix sum divided by two, the bin length, the
    chromosome names, the number of stored elements, the minimum and maximum
    stored value and the number of NaN bins. For cooler files the available
    bin columns are appended.

    If ``args.outFileName`` is set the reports are written to that file,
    otherwise they are printed to stdout. The output file is opened once,
    before the loop: reopening it in ``'w'`` mode for every matrix kept only
    the report of the last matrix. Every record written to the file is
    newline terminated so that consecutive records do not run together.
    """
    args = parse_arguments().parse_args()

    out_file = open(args.outFileName, 'w') if args.outFileName else None
    try:
        for matrix in args.matrices:
            hic_ma = hm.hiCMatrix(matrix)
            size = hic_ma.matrix.shape[0]
            num_non_zero = hic_ma.matrix.nnz
            sum_elements = hic_ma.matrix.sum() / 2
            bin_length = hic_ma.getBinSize()
            num_nan_bins = len(hic_ma.nan_bins)
            min_non_zero = hic_ma.matrix.data.min()
            max_non_zero = hic_ma.matrix.data.max()
            chromosomes = list(hic_ma.chrBinBoundaries)

            # The same lines are used for the file and for stdout.
            report = [
                "File:\t{}".format(matrix),
                "Size:\t{:,}".format(size),
                "Sum:\t{:,}".format(sum_elements),
                "Bin_length:\t{}".format(bin_length),
                "Chromosomes:\t{}".format(", ".join(toString(chromosomes))),
                "Non-zero elements:\t{:,}".format(num_non_zero),
                "Minimum (non zero):\t{}".format(min_non_zero),
                "Maximum:\t{}".format(max_non_zero),
                "NaN bins:\t{}".format(num_nan_bins),
            ]
            if check_cooler(matrix):
                report.append('The following columns are available: {}'.format(
                    hic_ma.getInformationCoolerBinNames()))

            if out_file is None:
                for entry in report:
                    print(entry)
            else:
                # The version header is only part of the file output.
                out_file.write(
                    "# Matrix information file. Created with HiCExplorer's hicInfo version {}\n"
                    .format(__version__))
                out_file.write("\n".join(report) + "\n")
    finally:
        if out_file is not None:
            out_file.close()
def plotLongRangeContacts(pAxis, pNameOfLongRangeContactsFile, pHiCMatrix, pRegion, pChromosomeOrder):
    """Mark long range contacts from a tab separated file on ``pAxis``.

    Each line of the file is expected to hold two intervals
    (``chrom start end chrom start end``); the two start positions are used as
    the x and y coordinate of a red, unfilled square marker.

    :param pAxis: axis object providing ``set_xlim``, ``set_ylim`` and ``plot``
    :param pNameOfLongRangeContactsFile: path of the contacts file
    :param pHiCMatrix: unused, kept for interface compatibility
    :param pRegion: ``None`` or a ``(chrom, start, end)`` triple; when given,
                    only contacts with both anchors on ``chrom`` and both
                    start positions inside ``[start, end]`` are plotted
    :param pChromosomeOrder: ``None`` or a container of chromosome names; when
                             given, both anchors must be on listed chromosomes

    Lines that cannot be parsed are skipped. Previously the region bounds were
    read inside a catch-all ``try`` block, so with ``pRegion=None`` every line
    failed silently and nothing was plotted; now the range filter is simply
    not applied when no region is given.
    """
    log.debug('pRegion {}'.format(pRegion))

    has_region = pRegion is not None
    if has_region:
        region_chrom = pRegion[0]
        region_start = int(pRegion[1])
        region_end = int(pRegion[2])

    x_list = []
    y_list = []
    with open(pNameOfLongRangeContactsFile, 'rb') as file:
        for line in file:
            fields = toString(line).strip().split('\t')
            try:
                chrom_X, start_X, _ = fields[0:3]
                chrom_Y, start_Y, _ = fields[3:6]
            except ValueError:
                # fewer than six columns: not a contact line
                continue

            if has_region and (chrom_X != region_chrom or chrom_Y != region_chrom):
                continue
            if pChromosomeOrder is not None and (chrom_X not in pChromosomeOrder or
                                                 chrom_Y not in pChromosomeOrder):
                continue

            try:
                x = int(start_X)
                y = int(start_Y)
            except ValueError:
                # non numeric start position (e.g. a header line)
                continue
            log.debug('x {} y {}'.format(x, y))

            if has_region and not (region_start <= x <= region_end and
                                   region_start <= y <= region_end):
                continue
            x_list.append(x)
            y_list.append(y)

    # 0 / 1e15 are treated as "no explicit range given": keep the axis limits.
    if has_region and (region_start != 0 and region_end != 1e15):
        pAxis.set_xlim(region_start, region_end)
        pAxis.set_ylim(region_start, region_end)
    pAxis.plot(x_list, y_list, 's', lw=2, markerfacecolor='none', markeredgecolor='red')
def main():
    """Print summary statistics for every Hi-C matrix given on the command line.

    For each matrix in ``args.matrices`` the file name, number of bins, matrix
    sum divided by two, bin length, chromosome names, number of stored
    elements, minimum and maximum stored value and the number of NaN bins are
    printed to stdout.

    The chromosome list is always computed, so it can neither be undefined nor
    carried over from a previous matrix, and the "lieberman matrix" debug
    message is only logged for files whose name actually ends in "lieberman"
    (the condition used to be inverted).
    """
    args = parse_arguments().parse_args()
    for matrix in args.matrices:
        hic_ma = hm.hiCMatrix(matrix)
        size = hic_ma.matrix.shape[0]
        num_non_zero = hic_ma.matrix.nnz
        sum_elements = hic_ma.matrix.sum() / 2
        bin_length = hic_ma.getBinSize()
        num_nan_bins = len(hic_ma.nan_bins)
        min_non_zero = hic_ma.matrix.data.min()
        max_non_zero = hic_ma.matrix.data.max()

        if matrix.endswith("lieberman"):
            log.debug("lieberman matrix")
        chromosomes = list(hic_ma.chrBinBoundaries)

        print("File:\t{}".format(matrix))
        print("Size:\t{:,}".format(size))
        print("Sum:\t{:,}".format(sum_elements))
        print("Bin_length:\t{}".format(bin_length))
        print("Chromosomes:\t{}".format(", ".join(toString(chromosomes))))
        print("Non-zero elements:\t{:,}".format(num_non_zero))
        print("Minimum (non zero):\t{}".format(min_non_zero))
        print("Maximum:\t{}".format(max_non_zero))
        print("NaN bins:\t{}".format(num_nan_bins))
def main():
    """Print summary statistics of each Hi-C matrix named on the command line.

    All values are gathered first and printed afterwards, one labelled,
    tab separated line per value. For cooler files
    ``getInformationCoolerBinNames`` is called at the end; its return value is
    not used here.
    """
    options = parse_arguments().parse_args()
    for matrix in options.matrices:
        hic_ma = hm.hiCMatrix(matrix)

        # collect the statistics
        bin_count = hic_ma.matrix.shape[0]
        stored_count = hic_ma.matrix.nnz
        half_sum = hic_ma.matrix.sum() / 2
        bin_size = hic_ma.getBinSize()
        nan_bin_count = len(hic_ma.nan_bins)
        smallest = hic_ma.matrix.data.min()
        largest = hic_ma.matrix.data.max()
        chrom_names = list(hic_ma.chrBinBoundaries)

        # report them
        print("File:\t{}".format(matrix))
        print("Size:\t{:,}".format(bin_count))
        print("Sum:\t{:,}".format(half_sum))
        print("Bin_length:\t{}".format(bin_size))
        chrom_text = ", ".join(toString(chrom_names))
        print("Chromosomes:\t{}".format(chrom_text))
        print("Non-zero elements:\t{:,}".format(stored_count))
        print("Minimum (non zero):\t{}".format(smallest))
        print("Maximum:\t{}".format(largest))
        print("NaN bins:\t{}".format(nan_bin_count))

        if not check_cooler(matrix):
            continue
        hic_ma.getInformationCoolerBinNames()
def get_no_comment_line(self):
    """
    Return the next line of the bed file that carries data.

    Lines starting with '#', "track" or "browser" as well as blank lines are
    skipped. ``self.line_number`` is advanced by the number of lines that were
    consumed (skipped lines included). The lines are skipped in a loop instead
    of by recursion, so a long comment header cannot exhaust the recursion
    limit.

    :return: the next data line, converted with ``toString``
    :raises StopIteration: when the file handle is exhausted; in that case
                           ``self.line_number`` is left untouched
    """
    consumed = 0
    while True:
        line = toString(next(self.file_handle))
        consumed += 1
        if line.startswith(("#", "track", "browser")) or line.strip() == '':
            continue
        break

    self.line_number += consumed
    return line
def change_chrom_names(chrom):
    """
    Toggle the 'chr' prefix of a chromosome name.

    A name starting with 'chr' (UCSC style) loses the prefix, any other name
    (ensembl style) gets 'chr' prepended.
    """
    # TODO: special names such as the mitochondrial chromosome are not mapped
    name = toString(chrom)
    has_prefix = name.startswith('chr')
    return name[3:] if has_prefix else 'chr' + name
def get_boundary_bin_id(hic, bed_fh):
    """
    Collect the matrix bin indices touched by the intervals of a bed file.

    Lines starting with 'browser', 'track' or '#' and blank lines are skipped.
    For every other line the first three tab separated fields are read as
    chromosome, start and end, and both values returned by
    ``hic.getRegionBinRange(chrom, start, end)`` are recorded.

    Malformed lines are logged and terminate the program with exit status 1
    (formerly ``sys.exit()`` was called without argument, which reports
    success to the calling shell).

    :param hic: HiCMatrix object
    :param bed_fh: file handle of the bed file
    :return: Sorted array of unique bin indices.
    :raises AssertionError: if the start of an interval is larger than its end
    """
    boundaries = set()
    for line_number, line in enumerate(bed_fh.readlines(), start=1):
        line = toString(line)
        if line.startswith(('browser', 'track', '#')):
            continue
        if not line.strip():
            # a blank line is not an error
            continue

        try:
            chrom, start, end = line.strip().split('\t')[0:3]
        except ValueError as detail:
            msg = 'Could not read line\n{}\n. {}'.format(line, detail)
            log.exception(msg)
            sys.exit(1)

        try:
            start = int(start)
            end = int(end)
        except ValueError as detail:
            msg = "Error reading line: {}. One of the fields is not " \
                  "an integer.\nError message: {}".format(line_number, detail)
            log.exception(msg)
            sys.exit(1)

        assert start <= end, "Error in line #{}, start larger than end in {}".format(
            line_number, line)

        # check the overlap of the region with the hic matrix bins
        start_bin, end_bin = hic.getRegionBinRange(chrom, start, end)
        boundaries.add(start_bin)
        boundaries.add(end_bin)

    return np.sort(list(boundaries))
def plotLongRangeContacts(pAxis, pNameOfLongRangeContactsFile, pHiCMatrix, pRegion):
    """Draw the long range contacts of one chromosome as red square outlines.

    The contacts file is read line by line; the first six tab separated
    columns are taken as two intervals. A contact is kept when both intervals
    lie on the chromosome ``pRegion[0]``; its two start positions become the
    x and y coordinate. Lines that fail to parse are ignored. The axis limits
    are set to ``pRegion[1]`` .. ``pRegion[2]`` in both directions.
    ``pHiCMatrix`` is not used.
    """
    xs, ys = [], []
    with open(pNameOfLongRangeContactsFile, 'rb') as handle:
        for raw in handle.readlines():
            columns = toString(raw).strip().split('\t')
            try:
                first_chrom, first_start, _first_end = columns[0:3]
                second_chrom, second_start, _second_end = columns[3:6]
                on_region_chrom = first_chrom == pRegion[0] and second_chrom == pRegion[0]
                if not on_region_chrom:
                    continue
                point = (int(first_start), int(second_start))
            except Exception:
                # best effort: anything that does not parse is skipped
                continue
            xs.append(point[0])
            ys.append(point[1])

    for set_limits in (pAxis.set_xlim, pAxis.set_ylim):
        set_limits(int(pRegion[1]), int(pRegion[2]))

    marker_style = dict(lw=2, markerfacecolor='none', markeredgecolor='red')
    pAxis.plot(xs, ys, 's', **marker_style)
def __init__(self, file_handle):
    """
    Set up the reader and guess the bed flavour from the first data line.

    The first non-comment line is read and handed to ``guess_file_type``;
    afterwards the file handle is rewound to the beginning. The line counter
    is reset together with the rewind, otherwise the lines consumed while
    guessing the type would be counted a second time when the file is read.

    :param file_handle: seekable file handle of the bed file
    :return:
    """
    self.file_type = None
    self.file_handle = file_handle
    self.line_number = 0

    # guess file type
    fields = toString(self.get_no_comment_line()).split('\t')
    self.guess_file_type(fields)

    # start over from the first line
    self.file_handle.seek(0)
    self.line_number = 0

    self.prev_chrom = None
    self.prev_start = -1
    self.prev_line = None

    # list of bed fields
    self.fields = [
        'chromosome', 'start', 'end', 'name', 'score', 'strand',
        'thick_start', 'thick_end', 'rgb', 'block_count', 'block_sizes',
        'block_starts'
    ]

    # bed12 uses all fields, bed9 the first nine, anything else the first six
    field_count = {'bed12': 12, 'bed9': 9}.get(self.file_type, 6)
    self.BedInterval = collections.namedtuple('BedInterval',
                                              self.fields[:field_count])
def _region_x_coordinates(region_start, region_end, num_values):
    """Return ``num_values`` evenly spaced x coordinates for a region plot."""
    step = int((region_end * 2 - region_start) // num_values)
    x = np.arange(region_start, region_end * 2, step)
    # np.arange normally yields at least num_values points; pad defensively.
    while len(x) < num_values:
        x = np.append(x, x[-1] + step)
    return x[:num_values]


def plotEigenvector(pAxis, pNameOfEigenvectorsList, pChromosomeList=None, pRegion=None, pXticks=None):
    """Plot an eigenvector track read from bedgraph or bigwig files on ``pAxis``.

    All files must share one extension ('bedgraph', 'bigwig' or 'bw'),
    otherwise an error is logged and the program exits with status 1. Either
    the values of all chromosomes in ``pChromosomeList`` are concatenated and
    plotted against their running index, or the values inside ``pRegion``
    (``(chrom, start, end)``) are plotted against genomic positions. Every
    file overwrites the data of the previous one; only the last file is drawn.

    If a chromosome has no entry in a file, or the region holds no values, a
    message is logged and the function returns without drawing.

    Fixes compared to the previous version: ``np.append`` was called without
    the array argument, bigwig files were never closed, an empty value list
    led to a division by zero, and invalid input exited with status 0.

    :param pAxis: axis object to draw on
    :param pNameOfEigenvectorsList: list of eigenvector file names
    :param pChromosomeList: optional list of chromosome names
    :param pRegion: optional ``(chrom, start, end)`` triple
    :param pXticks: unused, kept for interface compatibility
    """
    import sys

    log.debug('plotting eigenvector')
    pAxis.set_frame_on(False)

    file_format = pNameOfEigenvectorsList[0].split(".")[-1]
    if file_format not in ('bedgraph', 'bigwig', 'bw'):
        log.error("Given eigenvector files are not bedgraph or bigwig")
        sys.exit(1)
    for eigenvector_file in pNameOfEigenvectorsList:
        if eigenvector_file.split('.')[-1] != file_format:
            log.error("Eigenvector input files have different formats.")
            sys.exit(1)

    missing_chromosome = ("Chromosome with no entry in the eigenvector found. "
                          "Please exclude it from the matrix: {}. "
                          "The eigenvector is left empty.")

    if pRegion:
        chrom, region_start, region_end = pRegion
    x = None
    eigenvector = None

    if file_format in ('bigwig', 'bw'):
        for eigenvector_file in pNameOfEigenvectorsList:
            bw = pyBigWig.open(eigenvector_file)
            try:
                eigenvector = []
                if pChromosomeList:
                    for chrom in pChromosomeList:
                        try:
                            bins_list = bw.intervals(toString(chrom))
                        except Exception:
                            bins_list = None
                        if bins_list is None:
                            log.info(missing_chromosome.format(chrom))
                            return
                        eigenvector.extend(complex(bin_[2]).real for bin_ in bins_list)
                    x = np.arange(0, len(eigenvector), 1)
                    pAxis.set_xlim(0, len(eigenvector))
                elif pRegion:
                    try:
                        if region_start == 0 and region_end == 1e15:
                            # 0 / 1e15 stands for "whole chromosome"
                            log.debug("chrom == pRegion")
                            bins_list = bw.intervals(toString(chrom))
                            region_start = bins_list[0][0]
                            region_end = bins_list[-1][1]
                        else:
                            log.debug(
                                "chrom: {}, region_start: {}, region_end: {}".
                                format(chrom, region_start, region_end))
                            log.debug("pRegion: {}".format(pRegion))
                            bins_list = bw.intervals(chrom, region_start, region_end)
                    except Exception:
                        bins_list = None
                    if bins_list is None:
                        log.info(missing_chromosome.format(chrom))
                        return
                    eigenvector.extend(complex(bin_[2]).real for bin_ in bins_list)
                    if not eigenvector:
                        log.info(missing_chromosome.format(chrom))
                        return
                    x = _region_x_coordinates(region_start, region_end, len(eigenvector))
                    pAxis.set_xlim(region_start, region_end * 2)
            finally:
                bw.close()
    else:
        for eigenvector_file in pNameOfEigenvectorsList:
            interval_tree, _, _ = file_to_intervaltree(eigenvector_file)
            eigenvector = []
            if pChromosomeList:
                for chrom in pChromosomeList:
                    if toString(chrom) not in interval_tree:
                        log.info(missing_chromosome.format(chrom))
                        return
                    for region in sorted(interval_tree[toString(chrom)]):
                        eigenvector.append(complex(region.data[0]).real)
                x = np.arange(0, len(eigenvector), 1)
                pAxis.set_xlim(0, len(eigenvector))
            elif pRegion:
                if toString(chrom) not in interval_tree:
                    log.info(missing_chromosome.format(chrom))
                    return
                for region in sorted(interval_tree[toString(chrom)][region_start:region_end]):
                    eigenvector.append(float(region.data[0]))
                if not eigenvector:
                    log.info(missing_chromosome.format(chrom))
                    return
                x = _region_x_coordinates(region_start, region_end, len(eigenvector))
                pAxis.set_xlim(region_start, region_end * 2)

    if x is not None and eigenvector is not None:
        pAxis.fill_between(x, 0, eigenvector, edgecolor='none')
    pAxis.get_xaxis().set_visible(False)
def _normalize_coordinates(text):
    """Drop ',', ';' and '!' from *text* and turn every '-' into ':'."""
    for unwanted in (",", ";", "!"):
        text = text.replace(unwanted, "")
    return text.replace("-", ":")


def main(args=None):
    """Plot the number of interactions around a reference point (deprecated tool).

    ``args.region`` must have the form ``chrom:start-end`` and
    ``args.referencePoint`` either ``chrom:position`` or ``chrom:start-end``.
    For every matrix in ``args.matrix`` the viewpoint values are computed with
    ``getViewpointValues`` and drawn as one line; the figure is saved to
    ``args.outFileName``. If ``args.interactionOutFileName`` is set, the
    interactions of every matrix are additionally written to
    ``<interactionOutFileName>_<matrix file name>.bedgraph``.

    Invalid input (missing or malformed region, reference point without two
    or three fields) is reported and terminates the program with exit status
    1; previously a malformed region exited with status 0 and the other cases
    ended in a ``NameError``.
    """
    import sys

    args = parse_arguments().parse_args(args)
    log.warning('This tool is deprecated. Please use chicViewpoint, chicViewpointBackgroundModel and chicPlotViewpoint.')

    if not args.region:
        log.error("A region is required to plot the viewpoint.")
        sys.exit(1)
    args.region = _normalize_coordinates(args.region)
    region = args.region.split(":")
    if len(region) != 3:
        log.error("Region format is invalid {}".format(args.region))
        sys.exit(1)
    chrom, region_start, region_end = region[0], int(region[1]), int(region[2])

    args.referencePoint = _normalize_coordinates(args.referencePoint)
    referencePoint = args.referencePoint.split(":")
    if len(referencePoint) not in (2, 3):
        log.error("Reference point format is invalid {}".format(args.referencePoint))
        sys.exit(1)

    write_interactions = args.interactionOutFileName is not None
    data_list = []
    interactions_list = [] if write_interactions else None
    matrix_name_legend = []
    for matrix in args.matrix:
        view_point_start, view_point_end, view_point_range, data_list_, interactions_list_ \
            = getViewpointValues(matrix, referencePoint, chrom, region_start, region_end,
                                 args.interactionOutFileName, args.chromosome)
        data_list.append(data_list_)
        if write_interactions:
            interactions_list.append(interactions_list_)
        matrix_name_legend.append(os.path.basename(matrix))

    fig = plt.figure(figsize=(6.4, 4.8))
    ax = plt.subplot(111)
    matrices_plot_legend = []
    for i, data in enumerate(data_list):
        matrices_plot_legend.append(ax.plot(range(len(data)), data, alpha=0.7,
                                            label=matrix_name_legend[i])[0])

    # Tick positions are relative to the start of the plotted range.
    reference_start = int(referencePoint[1])
    origin = view_point_range[0]
    left_label = relabelTicks((reference_start - region_start) * (-1))
    reference_label = referencePoint[0] + ":" + relabelTicks(reference_start)
    right_label = relabelTicks(region_end - reference_start)

    if len(referencePoint) == 2:
        log.debug("Single reference point mode: {}".format(referencePoint))
        log.debug("label 0: {}".format((reference_start - region_start) * (-1)))
        log.debug("referencePoint[1]: {}".format(referencePoint[1]))
        log.debug("region_start: {}".format(region_start))
        log.debug("label 1: {}".format(reference_label))
        log.debug("label 2: {}".format(region_end - reference_start))
        ax.set_xticks([0, view_point_start - origin, view_point_range[1] - origin])
        xticklabels = [left_label, reference_label, right_label]
    else:
        log.debug("Range mode: {}".format(referencePoint))
        # fit scale: start coordinate is 0 --> view_point_range[0]
        ax.set_xticks([0, view_point_start - origin, view_point_end - origin,
                       view_point_range[1] - origin])
        reference_end_label = referencePoint[0] + ":" + relabelTicks(int(referencePoint[2]))
        xticklabels = [left_label, reference_label, reference_end_label, right_label]

    ax.set_xticklabels(xticklabels)
    ax.set_ylabel('Number of interactions')
    plt.legend(handles=matrices_plot_legend)
    plt.savefig(args.outFileName, dpi=args.dpi)
    plt.close(fig)

    if interactions_list is not None:
        for matrix_name, interactions in zip(matrix_name_legend, interactions_list):
            with open(args.interactionOutFileName + '_' + matrix_name + '.bedgraph', 'w') as fh:
                for interaction in interactions:
                    fh.write("{}\t{}\t{}\t{}\t{}\t{}\t{:.12f}\n".format(
                        toString(interaction[0]), toString(interaction[1]),
                        toString(interaction[2]), toString(interaction[3]),
                        toString(interaction[4]), toString(interaction[5]),
                        float(interaction[6])))
def _zero_nans_and_infs(matrix):
    """Return *matrix* as a dense matrix with NaN and inf entries set to zero."""
    matrix = convertNansToZeros(csr_matrix(matrix)).todense()
    return convertInfsToZeros(csr_matrix(matrix)).todense()


def _save_transformed_matrix(ma, transformed_matrix, file_name):
    """Save *transformed_matrix* with the bin information of *ma* (h5 or cool)."""
    file_type = 'h5' if file_name.endswith('.h5') else 'cool'
    matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
    matrixFileHandlerOutput.set_matrix_variables(
        transformed_matrix.tocsr(), ma.cut_intervals, ma.nan_bins,
        ma.correction_factors, ma.distance_counts)
    matrixFileHandlerOutput.save(file_name, pSymmetric=True, pApplyCorrection=False)


def main(args=None):
    """Compute eigenvectors per chromosome and write them as bedgraph or bigwig.

    For every chromosome the observed/expected matrix, its Pearson correlation
    matrix and the covariance of the latter are computed; the first
    ``args.numberOfEigenvectors`` columns returned by ``linalg.eig`` are kept.
    One output file per eigenvector is written (``args.outputFileName``).
    Optionally the observed/expected and the Pearson matrices are saved
    (``args.obsexpMatrix``, ``args.pearsonMatrix``) and the eigenvectors are
    post-processed with a gene track (``args.geneTrack``).

    The program exits with status 1 if the number of output files does not
    match the number of eigenvectors, if pyBigWig lacks numpy support, or if
    the output format is unknown.

    Complex values are detected with the builtin ``complex``: the alias
    ``np.complex`` used before no longer exists in current NumPy releases.
    """
    args = parse_arguments().parse_args(args)
    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        log.error(
            "Number of output file names and number of eigenvectors does not match. Please "
            "provide the name of each file.\nFiles: {}\nNumber of eigenvectors: {}"
            .format(args.outputFileName, args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)
    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []

    # PCA is computed per chromosome
    chromosome_count = len(ma.getChrNames())
    if args.pearsonMatrix:
        trasf_matrix_pearson = lil_matrix(ma.matrix.shape)
    if args.obsexpMatrix:
        trasf_matrix_obsexp = lil_matrix(ma.matrix.shape)

    # total number of bins over all chromosomes
    length_chromosome = 0
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]

    expected_function = exp_obs_matrix_norm if args.norm else exp_obs_matrix_lieberman
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        first_bin, last_bin = chr_range[0], chr_range[1]
        submatrix = ma.matrix[first_bin:last_bin, first_bin:last_bin]

        exp_obs_matrix_ = _zero_nans_and_infs(
            expected_function(submatrix, length_chromosome, chromosome_count))
        if args.obsexpMatrix:
            trasf_matrix_obsexp[first_bin:last_bin, first_bin:last_bin] = lil_matrix(
                exp_obs_matrix_)

        pearson_correlation_matrix = _zero_nans_and_infs(np.corrcoef(exp_obs_matrix_))
        if args.pearsonMatrix:
            trasf_matrix_pearson[first_bin:last_bin, first_bin:last_bin] = lil_matrix(
                pearson_correlation_matrix)

        corrmatrix = _zero_nans_and_infs(np.cov(pearson_correlation_matrix))
        # NOTE(review): the columns are used in the order returned by eig(),
        # they are not sorted by eigenvalue here - confirm this is intended.
        _, eigs = linalg.eig(corrmatrix)
        k = args.numberOfEigenvectors

        chrom, start, end, _ = zip(*ma.cut_intervals[first_bin:last_bin])
        vecs_list += eigs[:, :k].tolist()
        chrom_list += chrom
        start_list += start
        end_list += end

    if args.pearsonMatrix:
        _save_transformed_matrix(ma, trasf_matrix_pearson, args.pearsonMatrix)
    if args.obsexpMatrix:
        _save_transformed_matrix(ma, trasf_matrix_obsexp, args.obsexpMatrix)

    if args.geneTrack:
        vecs_list = correlateEigenvectorWithGeneTrack(ma, vecs_list, args.geneTrack)

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert (len(vecs_list) == len(chrom_list))
            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    if len(value) == args.numberOfEigenvectors:
                        if isinstance(value[idx], complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(
                            toString(chrom_list[i]), start_list[i],
                            end_list[i], value[idx]))
    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error(
                "ERROR: Your version of pyBigWig is not supporting numpy: {}".
                format(pyBigWig.__file__))
            exit(1)

        # header: one (chromosome, end of its last bin) entry per chromosome
        old_chrom = chrom_list[0]
        header = []
        for i, _chrom in enumerate(chrom_list):
            if old_chrom != _chrom:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = _chrom
        header.append((toString(chrom_list[-1]), end_list[-1]))

        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))
            assert (len(vecs_list) == len(chrom_list))

            _chrom_list = []
            _start_list = []
            _end_list = []
            values = []
            bw = pyBigWig.open(outfile, 'w')
            # set big wig header
            bw.addHeader(header)
            # create entry lists
            for i, value in enumerate(vecs_list):
                # it can happen that some 'value' is having less dimensions than it should
                if len(value) == args.numberOfEigenvectors:
                    if isinstance(value[idx], complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    _chrom_list.append(toString(chrom_list[i]))
                    _start_list.append(start_list[i])
                    _end_list.append(end_list[i])
            # write entries
            bw.addEntries(_chrom_list, _start_list, ends=_end_list, values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)
def main(args=None):
    """Correct a Hi-C matrix with ICE or KR, or create the diagnostic plot.

    Steps, in order:

    1. Load the matrix (optionally restricted to / reordered by
       ``args.chromosomes``).
    2. Mask bins whose row sum is zero (ICE correction and diagnostic plot).
    3. Replace NaN and inf values by zero and convert the data to float64.
    4. If the arguments contain ``plotName``: create the diagnostic plot and
       return.
    5. ICE only: mask outlier bins, low coverage bins and truncate trans
       contacts according to the respective arguments.
    6. Run the correction, either per chromosome (``args.perchr``) or on the
       whole matrix, and store the correction factors in the matrix object.
    7. ICE only: mask bins whose row sum grew by ``args.inflationCutoff`` or
       more.
    8. Save the matrix to ``args.outFileName``.
    """
    args = parse_arguments().parse_args(args)
    if args.verbose:
        log.setLevel(logging.INFO)

    # args.chromosomes
    # A cooler file combined with exactly one chromosome is loaded restricted
    # to that chromosome; otherwise the whole matrix is loaded and reordered.
    if check_cooler(args.matrix) and args.chromosomes is not None and len(
            args.chromosomes) == 1:
        ma = hm.hiCMatrix(args.matrix, pChrnameList=toString(args.chromosomes))
    else:
        ma = hm.hiCMatrix(args.matrix)
        if args.chromosomes:
            ma.reorderChromosomes(toString(args.chromosomes))

    # mask all zero value bins
    if 'correctionMethod' in args:
        if args.correctionMethod == 'ICE':
            row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
            log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
            ma.maskBins(np.flatnonzero(row_sum == 0))
            # remembered to verify below that outlier detection keeps the shape
            matrix_shape = ma.matrix.shape
    if 'plotName' in args:
        row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
        log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
        ma.maskBins(np.flatnonzero(row_sum == 0))
        matrix_shape = ma.matrix.shape

    ma.matrix = convertNansToZeros(ma.matrix)
    ma.matrix = convertInfsToZeros(ma.matrix)
    ma.matrix = ma.matrix.astype(np.float64, copy=True)

    log.debug('ma.matrix.indices {}'.format(ma.matrix.indices.dtype))
    log.debug('ma.matrix.data {}'.format(ma.matrix.data.dtype))
    log.debug('ma.matrix.indptr {}'.format(ma.matrix.indptr.dtype))
    # log.debug('ma.matrix.indices {}'.format(np.max(ma.matrix.indices)))
    # log.debug('ma.matrix.data {}'.format(np.max(ma.matrix.data)))
    # log.debug('ma.matrix.indptr {}'.format(np.max(ma.matrix.indptr)))
    # ma.matrix.indptr = ma.matrix.indptr.astype(np.int32, copy=False)
    # ma.matrix.indices = ma.matrix.indices.astype(np.int32, copy=False)

    # Diagnostic plot mode: nothing is corrected or saved.
    if 'plotName' in args:
        plot_total_contact_dist(ma, args)
        log.info("Saving diagnostic plot {}\n".format(args.plotName))
        return

    log.info("matrix contains {} data points. Sparsity {:.3f}.".format(
        len(ma.matrix.data),
        float(len(ma.matrix.data)) / (ma.matrix.shape[0]**2)))

    if args.skipDiagonal:
        ma.diagflat(value=0)

    total_filtered_out = set()
    if args.correctionMethod == 'ICE':
        if not args.filterThreshold:
            log.error('min and max filtering thresholds should be set')
            sys.exit(1)
        outlier_regions = filter_by_zscore(ma,
                                           args.filterThreshold[0],
                                           args.filterThreshold[1],
                                           perchr=args.perchr)
        # compute and print some statistics
        pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
        # NOTE(review): the second format argument has no placeholder in the
        # label and is therefore ignored.
        ma.printchrtoremove(outlier_regions,
                            label="Bins that are MAD outliers ({:.2f}%) "
                            "out of".format(pct_outlier, ma.matrix.shape[0]),
                            restore_masked_bins=False)

        assert matrix_shape == ma.matrix.shape
        # mask filtered regions
        ma.maskBins(outlier_regions)
        total_filtered_out = set(outlier_regions)

        if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1:
            chrom, _, _, coverage = zip(*ma.cut_intervals)

            assert type(coverage[0]) == np.float64

            failed_bins = np.flatnonzero(
                np.array(coverage) < args.sequencedCountCutoff)

            ma.printchrtoremove(failed_bins,
                                label="Bins with low coverage",
                                restore_masked_bins=False)
            ma.maskBins(failed_bins)
            # NOTE(review): this replaces the outlier set stored above instead
            # of extending it, while the inflation step below uses union() -
            # confirm which behaviour is intended.
            total_filtered_out = set(failed_bins)
            # The string below is disabled code kept as a bare string statement.
            """ ma.matrix, to_remove = fill_gaps(ma, failed_bins) log.warning("From {} failed bins, {} could " "not be filled\n".format(len(failed_bins), len(to_remove))) ma.maskBins(to_remove) """

        if args.transCutoff and 0 < args.transCutoff < 100:
            cutoff = float(args.transCutoff) / 100
            # a usual cutoff is 0.05
            ma.truncTrans(high=cutoff)

    # row sums before the correction, compared with the sums afterwards below
    pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    correction_factors = []
    corrected_matrix = lil_matrix(ma.matrix.shape)
    if args.perchr:
        # normalize each chromosome independently
        for chrname in list(ma.interval_trees):
            chr_range = ma.getChrBinRange(chrname)
            chr_submatrix = ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]
            if args.correctionMethod == 'ICE':
                _matrix, _corr_factors = iterative_correction(
                    chr_submatrix, args)
                corrected_matrix[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = _matrix
                correction_factors.append(_corr_factors)
            else:
                # Set the kr matrix along with its correction factors vector
                assert (args.correctionMethod == 'KR')
                log.debug("Loading a float sparse matrix for KR balancing")
                kr = kr_balancing(
                    chr_submatrix.shape[0], chr_submatrix.shape[1],
                    chr_submatrix.count_nonzero(),
                    chr_submatrix.indptr.astype(np.int64, copy=False),
                    chr_submatrix.indices.astype(np.int64, copy=False),
                    chr_submatrix.data.astype(np.float64, copy=False))
                kr.computeKR()
                # the corrected values are only fetched for h5 output
                if args.outFileName.endswith('.h5'):
                    corrected_matrix[
                        chr_range[0]:chr_range[1],
                        chr_range[0]:chr_range[1]] = kr.get_normalised_matrix(
                            True)
                # correction_factors.append(np.true_divide(1,
                #                           kr.get_normalisation_vector(False).todense()))
                correction_factors.append(
                    kr.get_normalisation_vector(False).todense())
        # one vector for the whole matrix out of the per chromosome vectors
        correction_factors = np.concatenate(correction_factors)
    else:
        if args.correctionMethod == 'ICE':
            corrected_matrix, correction_factors = iterative_correction(
                ma.matrix, args)
            ma.setMatrixValues(corrected_matrix)
        else:
            assert (args.correctionMethod == 'KR')
            log.debug("Loading a float sparse matrix for KR balancing")
            kr = kr_balancing(ma.matrix.shape[0], ma.matrix.shape[1],
                              ma.matrix.count_nonzero(),
                              ma.matrix.indptr.astype(np.int64, copy=False),
                              ma.matrix.indices.astype(np.int64, copy=False),
                              ma.matrix.data.astype(np.float64, copy=False))
            log.debug('passed pointers')
            kr.computeKR()
            log.debug('computation done')
            # set it to False since the vector is already normalised
            # with the previous True
            # correction_factors = np.true_divide(1, kr.get_normalisation_vector(False).todense())
            correction_factors = kr.get_normalisation_vector(False).todense()
            if args.outFileName.endswith('.h5'):
                corrected_matrix = kr.get_normalised_matrix(True)

    # For h5 output the corrected values are stored in the matrix object.
    if args.outFileName.endswith('.h5'):
        ma.setMatrixValues(corrected_matrix)
    # if
    ma.setCorrectionFactors(correction_factors)

    log.debug("Correction factors {}".format(correction_factors[:10]))
    if args.inflationCutoff and args.inflationCutoff > 0 and args.correctionMethod == 'ICE':
        after_row_sum = np.asarray(corrected_matrix.sum(axis=1)).flatten()
        # identify rows that were expanded more than args.inflationCutoff times
        to_remove = np.flatnonzero(
            after_row_sum / pre_row_sum >= args.inflationCutoff)
        ma.printchrtoremove(to_remove,
                            label="inflated >={} "
                            "regions".format(args.inflationCutoff),
                            restore_masked_bins=False)
        total_filtered_out = total_filtered_out.union(to_remove)
        ma.maskBins(to_remove)

    ma.printchrtoremove(sorted(list(total_filtered_out)),
                        label="Total regions to be removed",
                        restore_masked_bins=False)

    # the correction is not applied again while saving
    ma.save(args.outFileName, pApplyCorrection=False)
def main(args=None):
    """Plot a Hi-C matrix as a heatmap and save the figure.

    The matrix is loaded either through the cooler specific path (only the
    requested region or chromosomes are read) or completely. Depending on the
    arguments a region, a custom chromosome order or one panel per chromosome
    is plotted, optionally with a log colour scale and an additional bigwig
    track. The figure is written to ``args.outFileName``.
    """
    args = parse_arguments().parse_args(args)
    if args.title:
        args.title = remove_non_ascii(args.title)

    # Stay None unless a region is given; passed on to plotHeatmap below.
    chrom = None
    start_pos1 = None
    chrom2 = None
    start_pos2 = None

    if args.perChromosome and args.region:
        log.error('ERROR, choose from the option '
                  '--perChromosome or --region, the two '
                  'options at the same time are not '
                  'compatible.')
        exit(1)

    # if args.region and args.region2 and args.bigwig:
    #     log.error("Inter-chromosomal pca is not supported.")
    #     exit(1)
    # is_cooler = False
    # if args.matrix.endswith('.cool') or cooler.io.is_cooler(args.matrix) or'.mcool' in args.matrix:
    is_cooler = check_cooler(args.matrix)
    log.debug("Cooler or no cooler: {}".format(is_cooler))

    # The cooler specific loading is skipped when more than one chromosome is
    # requested via --chromosomeOrder or when a second region is given.
    open_cooler_chromosome_order = True
    if args.chromosomeOrder is not None and len(args.chromosomeOrder) > 1:
        open_cooler_chromosome_order = False

    if is_cooler and not args.region2 and open_cooler_chromosome_order:
        log.debug("Retrieve data from cooler format and use its benefits.")
        regionsToRetrieve = None
        if args.region:
            regionsToRetrieve = []
            regionsToRetrieve.append(args.region)
            # if args.region2:
            #     chrom2, region_start2, region_end2 = translate_region(args.region2)
            #     regionsToRetrieve.append(args.region2)
        if args.chromosomeOrder:
            # a chromosome order overrides any region
            args.region = None
            args.region2 = None
            regionsToRetrieve = args.chromosomeOrder

        ma = HiCMatrix.hiCMatrix(args.matrix, pChrnameList=regionsToRetrieve)
        log.debug('Shape {}'.format(ma.matrix.shape))
        if args.clearMaskedBins:
            ma.maskBins(ma.nan_bins)
            # to avoid gaps in the plot, bins flanking the masked bins
            # are enlarged
            new_intervals = enlarge_bins(ma.cut_intervals)
            ma.setCutIntervals(new_intervals)

        if args.region:
            chrom, region_start, region_end, idx1, start_pos1, chrom2, region_start2, region_end2, idx2, start_pos2 = getRegion(args, ma)

        matrix = np.asarray(ma.matrix.todense().astype(float))
        matrix_length = len(matrix[0])
        log.debug("Number of data points matrix_cool: {}".format(matrix_length))
    else:
        ma = HiCMatrix.hiCMatrix(args.matrix)
        if args.clearMaskedBins:
            ma.maskBins(ma.nan_bins)
            new_intervals = enlarge_bins(ma.cut_intervals)
            ma.setCutIntervals(new_intervals)

        if args.chromosomeOrder:
            args.region = None
            args.region2 = None

            valid_chromosomes = []
            invalid_chromosomes = []
            log.debug('args.chromosomeOrder: {}'.format(args.chromosomeOrder))
            log.debug("ma.chrBinBoundaries {}".format(ma.chrBinBoundaries))
            if sys.version_info[0] == 3:
                args.chromosomeOrder = toBytes(args.chromosomeOrder)
            # keep only the chromosomes that exist in the matrix
            for chrom in toString(args.chromosomeOrder):
                if chrom in ma.chrBinBoundaries:
                    valid_chromosomes.append(chrom)
                else:
                    invalid_chromosomes.append(chrom)

            if len(invalid_chromosomes) > 0:
                log.warning("WARNING: The following chromosome/scaffold names were not found. Please check"
                            "the correct spelling of the chromosome names. \n")
                log.warning("\n".join(invalid_chromosomes))
            ma.reorderChromosomes(valid_chromosomes)

        log.info("min: {}, max: {}\n".format(ma.matrix.data.min(), ma.matrix.data.max()))

        if args.region:
            chrom, region_start, region_end, idx1, start_pos1, chrom2, region_start2, region_end2, idx2, start_pos2 = getRegion(args, ma)
            # rows of the first region, columns of the second region
            matrix = np.asarray(ma.matrix[idx1, :][:, idx2].todense().astype(float))
        else:
            log.debug("Else branch")
            matrix = np.asarray(ma.getMatrix().astype(float))

        matrix_length = len(matrix[0])
        log.debug("Number of data points matrix: {}".format(matrix_length))

    # Sanity check: rows of differing length are only reported, not handled.
    for matrix_ in matrix:
        if not matrix_length == len(matrix_):
            log.error("Matrices do not have the same length: {} , {}".format(matrix_length, len(matrix_)))

    cmap = cm.get_cmap(args.colorMap)
    log.debug("Nan values set to black\n")
    cmap.set_bad('black')

    bigwig_info = None
    if args.bigwig:
        # the axes are filled in below, once the figure layout exists
        bigwig_info = {'args': args, 'axis': None, 'axis_colorbar': None, 'nan_bins': ma.nan_bins}

    if args.perChromosome:
        fig = plotPerChr(ma, cmap, args, pBigwig=bigwig_info)
    else:
        norm = None
        if args.log or args.log1p:
            # zeros are replaced by the smallest non-zero value
            mask = matrix == 0
            matrix[mask] = np.nanmin(matrix[mask == False])

            if np.isnan(matrix).any() or np.isinf(matrix).any():
                log.debug("any nan {}".format(np.isnan(matrix).any()))
                log.debug("any inf {}".format(np.isinf(matrix).any()))
                mask_nan = np.isnan(matrix)
                mask_inf = np.isinf(matrix)
                matrix[mask_nan] = np.nanmin(matrix[mask_nan == False])
                matrix[mask_inf] = np.nanmin(matrix[mask_inf == False])

        log.debug("any nan after remove of nan: {}".format(np.isnan(matrix).any()))
        log.debug("any inf after remove of inf: {}".format(np.isinf(matrix).any()))
        if args.log1p:
            matrix += 1
            norm = LogNorm()
        elif args.log:
            norm = LogNorm()

        if args.bigwig:
            # increase figure height to accommodate bigwig track
            fig_height = 8.5
        else:
            fig_height = 7
        # heatmap size and margins as fractions of the figure size
        height = 4.8 / fig_height

        fig_width = 8
        width = 5.0 / fig_width
        left_margin = (1.0 - width) * 0.5

        fig = plt.figure(figsize=(fig_width, fig_height), dpi=args.dpi)

        if args.bigwig:
            # 2x2 grid: heatmap (top left), bigwig track (bottom left),
            # colorbar (top right)
            gs = gridspec.GridSpec(2, 2, height_ratios=[0.90, 0.1], width_ratios=[0.97, 0.03])
            gs.update(hspace=0.05, wspace=0.05)
            ax1 = plt.subplot(gs[0, 0])
            ax2 = plt.subplot(gs[1, 0])
            ax3 = plt.subplot(gs[0, 1])
            bigwig_info['axis'] = ax2
            bigwig_info['axis_colorbar'] = ax3
        else:
            ax1 = None
        bottom = 1.3 / fig_height

        if start_pos1 is None:
            start_pos1 = make_start_pos_array(ma)

        position = [left_margin, bottom, width, height]
        plotHeatmap(matrix, ma.get_chromosome_sizes(), fig, position,
                    args, cmap, xlabel=chrom, ylabel=chrom2,
                    start_pos=start_pos1, start_pos2=start_pos2, pNorm=norm, pAxis=ax1, pBigwig=bigwig_info)

    if not args.disable_tight_layout:
        if args.perChromosome or args.bigwig:
            # a failing tight layout is logged and the plot is saved as is
            try:
                plt.tight_layout()
            except UserWarning:
                log.info("Failed to tight layout. Using regular plot.")
            except ValueError:
                log.info("Failed to tight layout. Using regular plot.")

    plt.savefig(args.outFileName, dpi=args.dpi)
    plt.close(fig)
def plotPerChr(hic_matrix, cmap, args, pBigwig):
    """
    Plot every chromosome of ``hic_matrix`` in its own panel.

    Panels are laid out on a grid with at most five chromosomes per row; the
    grid has one extra narrow column at the end. When ``pBigwig`` is truthy,
    each panel is split into a heatmap axis, a track axis below it and a
    colorbar axis to its right.

    :param hic_matrix: matrix object providing ``getChrNames``,
                       ``getChrBinRange``, ``get_chromosome_sizes``,
                       ``cut_intervals``, ``nan_bins`` and ``matrix``
    :param cmap: colormap handed through to ``plotHeatmap``
    :param args: parsed command line arguments (``dpi``, ``log``, ``log1p``
                 are read here). NOTE: ``args.region`` is overwritten with
                 each chromosome name in turn.
    :param pBigwig: truthy if a bigwig track should be drawn under each panel
    :return: the matplotlib figure holding all panels
    """
    from math import ceil

    panels_per_row = 5
    chromosomes = hic_matrix.getChrNames()
    n_chromosomes = len(chromosomes)
    num_rows = int(ceil(float(n_chromosomes) / panels_per_row))
    num_cols = min(panels_per_row, n_chromosomes)

    width_ratios = [1.0] * num_cols + [0.05]
    grids = gridspec.GridSpec(num_rows, num_cols + 1,
                              width_ratios=width_ratios,
                              height_ratios=[1] * num_rows)

    figure_size = (sum((np.array(width_ratios) + 0.05) * 6), 6 * num_rows)
    fig = plt.figure(figsize=figure_size, dpi=args.dpi)

    # The values are not used below; the unpacking is kept because it fails
    # early when the cut intervals are empty or are not 4-tuples.
    _chrom, _start, _end, _extra = zip(*hic_matrix.cut_intervals)

    use_log_scale = args.log or args.log1p

    for idx, chrname in enumerate(chromosomes):
        log.debug('chrom: {}'.format(chrname))
        row, col = divmod(idx, panels_per_row)

        if pBigwig:
            inner_grid = gridspec.GridSpecFromSubplotSpec(
                2, 2,
                height_ratios=[0.85, 0.15],
                width_ratios=[0.93, 0.07],
                subplot_spec=grids[row, col],
                wspace=0.1, hspace=0.1)
            axis = plt.subplot(inner_grid[0, 0])
            axis_eigenvector = plt.subplot(inner_grid[1, 0])
            axis_scale = plt.subplot(inner_grid[0, 1])
        else:
            axis = plt.subplot(grids[row, col])
        axis.set_title(toString(chrname))

        first_bin, last_bin = hic_matrix.getChrBinRange(chrname)[:2]
        chrom_range = (first_bin, last_bin)
        submatrix = hic_matrix.matrix[chrom_range[0]:chrom_range[1],
                                      chrom_range[0]:chrom_range[1]]
        matrix = np.asarray(submatrix.todense().astype(float))

        if use_log_scale:
            # All three masks are taken before any value is replaced.
            zero_mask = matrix == 0
            nan_mask = np.isnan(matrix)
            inf_mask = np.isinf(matrix)
            log.debug("any nan {}".format(nan_mask.any()))
            log.debug("any inf {}".format(inf_mask.any()))

            try:
                # Replace zeros, NaNs and infs with the smallest remaining
                # value so a logarithmic colour scale can be applied.
                matrix[zero_mask] = np.nanmin(matrix[~zero_mask])
                matrix[nan_mask] = np.nanmin(matrix[~nan_mask])
                matrix[inf_mask] = np.nanmin(matrix[~inf_mask])
            except Exception:
                log.debug("Clearing of matrix failed.")
            log.debug("any nanafter remove of nan: {}".format(np.isnan(matrix).any()))
            log.debug("any inf after remove of inf: {}".format(np.isinf(matrix).any()))

        if args.log1p:
            matrix += 1
        norm = LogNorm() if (args.log1p or args.log) else None

        if pBigwig:
            bigwig_info = {'args': args,
                           'axis': axis_eigenvector,
                           'axis_colorbar': axis_scale,
                           'nan_bins': hic_matrix.nan_bins}
        else:
            bigwig_info = None

        chr_bin_boundary = OrderedDict()
        chr_bin_boundary[chrname] = hic_matrix.get_chromosome_sizes()[chrname]

        # getRegion reads the region from args, so point it at this chromosome.
        args.region = toString(chrname)
        _, _, _, _, start_pos1, _, _, _, _, start_pos2 = getRegion(args, hic_matrix)

        plotHeatmap(matrix, chr_bin_boundary, fig, None, args, cmap,
                    xlabel=chrname, ylabel=chrname,
                    start_pos=start_pos1, start_pos2=start_pos2,
                    pNorm=norm, pAxis=axis, pBigwig=bigwig_info)
    return fig
def plotHeatmap(ma, chrBinBoundaries, fig, position, args, cmap, xlabel=None,
                ylabel=None, start_pos=None, start_pos2=None, pNorm=None,
                pAxis=None, pBigwig=None):
    """
    Draw a matrix as a heatmap with a colorbar and, optionally, a bigwig track.

    Nothing is drawn (and ``None`` is returned) when the matrix has fewer
    than five rows.

    :param ma: 2D array to plot
    :param chrBinBoundaries: mapping of chromosome name to size; without
                             ``args.region`` the cumulative sizes are used
                             as tick positions and the keys as tick labels
    :param fig: figure that receives the colorbar (and the axes if ``pAxis``
                is None)
    :param position: axes rectangle passed to ``fig.add_axes`` when
                     ``pAxis`` is None
    :param args: parsed arguments; ``title``, ``vMin``, ``vMax``, ``region``,
                 ``scoreName``, ``flipBigwigSign``, ``scaleFactorBigwig``,
                 ``vMinBigwig`` and ``vMaxBigwig`` are read
    :param cmap: colormap for the heatmap
    :param xlabel: label of the x axis (optional)
    :param ylabel: label of the y axis (optional)
    :param start_pos: coordinates along the first matrix dimension; defaults
                      to ``0 .. ma.shape[0] - 1``
    :param start_pos2: coordinates along the second dimension; defaults to
                       ``start_pos``
    :param pNorm: colour normalisation passed to ``pcolormesh``
    :param pAxis: existing axes to draw into (optional)
    :param pBigwig: dict with the keys 'axis', 'axis_colorbar' and 'args', or
                    None if no bigwig track is drawn
    """
    log.debug("plotting heatmap")
    if ma.shape[0] < 5:
        # This happens when a tiny matrix wants to be plotted, or by using per chromosome and
        # a small chromosome (eg. contig) is present.
        # Otherwise, pcolormesh will throw an error if the matrix size is 1.
        chr_names = " ".join([toString(x) for x in chrBinBoundaries.keys()])
        log.info("Matrix for {} too small to plot. Matrix size: {}".format(chr_names, ma.shape))
        return

    axHeat2 = pAxis if pAxis is not None else fig.add_axes(position)

    if args.title:
        axHeat2.set_title(toString(args.title))

    # Without explicit coordinates fall back to plain bin indices instead of
    # handing None to np.meshgrid.
    if start_pos is None:
        start_pos = np.arange(ma.shape[0])
    if start_pos2 is None:
        start_pos2 = start_pos

    xmesh, ymesh = np.meshgrid(start_pos, start_pos2)
    img3 = axHeat2.pcolormesh(xmesh.T, ymesh.T, ma, vmin=args.vMin, vmax=args.vMax,
                              cmap=cmap, norm=pNorm)
    axHeat2.invert_yaxis()
    img3.set_rasterized(True)

    if args.region:
        xtick_lables = relabel_ticks(axHeat2.get_xticks())
        axHeat2.get_xaxis().set_tick_params(which='both', bottom='on', direction='out')
        axHeat2.set_xticklabels(xtick_lables, size='small', rotation=45)

        ytick_lables = relabel_ticks(axHeat2.get_yticks())
        axHeat2.get_yaxis().set_tick_params(which='both', bottom='on', direction='out')
        axHeat2.set_yticklabels(ytick_lables, size='small')

        xticks = [xtick_lables]
    else:
        # One tick at the start of every chromosome (cumulative sizes).
        pos = 0
        ticks = []
        for chr_size in chrBinBoundaries.values():
            ticks.append(pos)
            pos += chr_size

        labels = toString(list(chrBinBoundaries))
        axHeat2.set_xticks(ticks)
        axHeat2.set_yticks(ticks)
        xticks = [labels, ticks]

        # Many chromosomes: shrink the labels and rotate the x labels.
        if len(labels) > 20:
            axHeat2.set_xticklabels(labels, size=4, rotation=90)
            axHeat2.set_yticklabels(labels, size=4)
        else:
            axHeat2.set_xticklabels(labels, size=8)
            axHeat2.set_yticklabels(labels, size=8)

    if pBigwig is None:
        divider = make_axes_locatable(axHeat2)
        cax = divider.append_axes("right", size="2.5%", pad=0.09)
    else:
        cax = pBigwig['axis_colorbar']

    cbar = fig.colorbar(img3, cax=cax)
    cbar.solids.set_edgecolor("face")  # to avoid white lines in the color bar in pdf plots
    if args.scoreName:
        cbar.ax.set_ylabel(args.scoreName, rotation=270, size=8)

    if ylabel is not None:
        axHeat2.set_ylabel(toString(ylabel))

    if xlabel is not None:
        axHeat2.set_xlabel(toString(xlabel))

    if pBigwig:
        # The track sits below the heatmap, so move the x axis to the top.
        axHeat2.xaxis.set_label_position("top")
        axHeat2.xaxis.tick_top()

        bigwig_kwargs = dict(pChromosomeSizes=chrBinBoundaries,
                             pXticks=xticks,
                             pFlipBigwigSign=args.flipBigwigSign,
                             pScaleFactorBigwig=args.scaleFactorBigwig,
                             pValueMin=args.vMinBigwig,
                             pValueMax=args.vMaxBigwig)
        if args.region:
            log.debug('region')
            bigwig_kwargs['pRegion'] = pBigwig['args'].region
        else:
            log.debug('else region')
        plotBigwig(pBigwig['axis'], pBigwig['args'].bigwig, **bigwig_kwargs)
def main(args=None):
    """
    Compute eigenvectors per chromosome and write them as bedgraph or bigwig.

    For every chromosome the observed/expected matrix is computed, turned
    into a Pearson correlation matrix and then into a covariance matrix whose
    first ``args.numberOfEigenvectors`` eigenvectors are kept. One output
    file is written per eigenvector; the process exits with status 1 when the
    number of output files does not match the number of eigenvectors, when
    pyBigWig lacks numpy support or when the output format is unknown.

    :param args: list of command line arguments (None lets argparse use
                 ``sys.argv``)
    """
    args = parse_arguments().parse_args(args)
    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        log.error(
            "Number of output file names and number of eigenvectors does not match. Please "
            "provide the name of each file.\nFiles: {}\nNumber of eigenvectors: {}"
            .format(args.outputFileName, args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []
    k = args.numberOfEigenvectors

    # PCA is computed per chromosome
    length_chromosome = 0
    chromosome_count = len(ma.getChrNames())
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]

    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        log.debug("Computing pca for chromosome: {}".format(chrname))

        submatrix = ma.matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]]

        exp_obs_matrix_ = exp_obs_matrix_lieberman(submatrix, length_chromosome,
                                                   chromosome_count)
        exp_obs_matrix_ = convertNansToZeros(csr_matrix(exp_obs_matrix_)).todense()
        exp_obs_matrix_ = convertInfsToZeros(csr_matrix(exp_obs_matrix_)).todense()

        pearson_correlation_matrix = np.corrcoef(exp_obs_matrix_)
        pearson_correlation_matrix = convertNansToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()
        pearson_correlation_matrix = convertInfsToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()

        corrmatrix = np.cov(pearson_correlation_matrix)
        corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
        corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()

        _evals, eigs = linalg.eig(corrmatrix)

        chrom, start, end, _ = zip(*ma.cut_intervals[chr_range[0]:chr_range[1]])
        vecs_list += eigs[:, :k].tolist()
        chrom_list += chrom
        start_list += start
        end_list += end

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert (len(vecs_list) == len(chrom_list))

            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    # it can happen that some 'value' is having less dimensions than it should
                    if len(value) == args.numberOfEigenvectors:
                        # `np.complex` was removed from NumPy; the builtin
                        # `complex` also matches NumPy complex scalars.
                        if isinstance(value[idx], complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(
                            toString(chrom_list[i]), start_list[i],
                            end_list[i], value[idx]))

    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error(
                "ERROR: Your version of pyBigWig is not supporting numpy: {}".
                format(pyBigWig.__file__))
            exit(1)

        # The bigwig header needs (chromosome, length) pairs; the end of the
        # last bin of each chromosome is used as its length.
        old_chrom = chrom_list[0]
        header = []
        for i, chrom_ in enumerate(chrom_list):
            if old_chrom != chrom_:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = chrom_
        header.append((toString(chrom_list[-1]), end_list[-1]))

        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))
            assert (len(vecs_list) == len(chrom_list))

            chrom_list_ = []
            start_list_ = []
            end_list_ = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            try:
                # set big wig header
                bw.addHeader(header)
                # create entry lists
                for i, value in enumerate(vecs_list):
                    # it can happen that some 'value' is having less dimensions than it should
                    if len(value) == args.numberOfEigenvectors:
                        if isinstance(value[idx], complex):
                            value[idx] = value[idx].real
                        values.append(value[idx])
                        chrom_list_.append(toString(chrom_list[i]))
                        start_list_.append(start_list[i])
                        end_list_.append(end_list[i])

                # write entries
                bw.addEntries(chrom_list_, start_list_, ends=end_list_, values=values)
            finally:
                # Release the file handle even if writing fails.
                bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)
def get_bed_interval(self, bed_line):
    r"""
    Processes each bed line from a bed file, casts the values and returns
    a namedtuple object

    The number of fields is checked against ``self.file_type`` (bed3, bed6
    or bed12). If one of the integer fields (start, end, thickStart,
    thickEnd, blockCount) cannot be parsed, a warning is logged and an empty
    dict is returned instead of a namedtuple.

    >>> bed_line="chr1\t0\t1000\tgene_1\t0.5\t-\t0\t1000\t0\t3\t10,20,100\t20,200,700"
    >>> with open('/tmp/test.bed', 'w') as fh:
    ...     foo = fh.write(bed_line)
    >>> bed_f = ReadBed(open('/tmp/test.bed','r'))
    >>> bed = bed_f.get_bed_interval(bed_line)
    >>> bed.chromosome
    'chr1'
    >>> bed.block_starts
    [20, 200, 700]

    >>> bed_line="chr2\t0\t1000\tgene_1\t0.5\t-\n"
    >>> with open('/tmp/test.bed', 'w') as fh:
    ...     foo = fh.write(bed_line)
    >>> bed_f = ReadBed(open('/tmp/test.bed','r'))
    >>> bed_f.get_bed_interval(bed_line)
    BedInterval(chromosome='chr2', start=0, end=1000, name='gene_1', score=0.5, strand='-')
    """
    line_data = bed_line.strip()
    line_data = toString(line_data)
    line_data = line_data.split("\t")

    # The check has to look at the detected file *type*; the file handle is
    # never equal to a string, which silently disabled the bed12 check.
    if self.file_type == 'bed12':
        assert len(line_data) == 12, "File type detected is bed12 but line {}: {} does " \
            "not have 12 fields.".format(self.line_number, bed_line)
    elif self.file_type == 'bed3':
        assert len(line_data) == 3, "File type detected is bed3 but line {}: {} does " \
            "not have 3 fields.".format(self.line_number, bed_line)
    elif self.file_type == 'bed6':
        assert len(line_data) == 6, "File type detected is bed6 but line {}: {} does " \
            "not have 6 fields.".format(self.line_number, bed_line)

    line_values = []
    for idx, r in enumerate(line_data):
        # first field is always chromosome/contig name
        # and should be cast as a string
        # same for field 3 (name)
        if idx in [0, 3]:
            line_values.append(r)

        # check field strand
        elif idx == 5:
            if r not in ['+', '-', '.']:
                if r == '1':
                    r = '+'
                elif r == '-1':
                    r = '-'
                else:
                    # Arguments follow the placeholders: value, line number,
                    # line content.
                    log.warning(
                        "*Warning, invalid strand value found {} for line #{}:\n{}\n "
                        "Setting strand to '.'\n".format(
                            r, self.line_number, bed_line))
                    r = '.'
            line_values.append(r)

        elif idx in [1, 2, 6, 7, 9]:
            # start and end fields must be integers, same for thickStart(6),
            # and thickEnd(7) and blockCount(9) fields
            try:
                line_values.append(int(r))
            except ValueError:
                log.warning(
                    "Value: {} in field {} at line {} is not an integer\n".
                    format(r, idx + 1, self.line_number))
                return dict()

        # check item rgb
        elif idx == 8:
            r = toString(r)
            rgb = r.split(",")
            if len(rgb) == 3:
                try:
                    # Build the list eagerly: a lazy map() would defer the
                    # ValueError past this handler on Python 3.
                    r = [int(x) for x in rgb]
                except ValueError as detail:
                    log.debug(
                        "Error reading line: #{}. The rgb field {} is not "
                        "valid.\nError message: {}\n".format(
                            self.line_number, r, detail))
            line_values.append(r)

        elif idx in [10, 11]:
            # these are the block sizes and block start positions
            r = toString(r)
            r_parts = r.split(',')
            try:
                r = [int(x) for x in r_parts if x != '']
            except ValueError as detail:
                log.debug(
                    "Error reading line #{}. The block field {} is not "
                    "valid.\nError message: {}\n".format(
                        self.line_number, r, detail))
            line_values.append(r)

        else:
            # Remaining fields (e.g. the score) become floats when possible
            # and are kept unchanged otherwise.
            try:
                tmp = float(r)
            except (ValueError, TypeError):
                tmp = r
            line_values.append(tmp)

    assert line_values[2] > line_values[1], \
        "Start position larger or equal than end for line #{}:\n{}\n".format(self.line_number,
                                                                            bed_line)

    if self.file_type == 'bed3':
        line_values = line_values[0:3]
        # in case of a bed3, the id, score and strand
        # values are added as ".", 0, "." respectively
        line_values.extend([".", 0, "."])
    elif self.file_type == 'bed6':
        line_values = line_values[0:6]

    return self.BedInterval._make(line_values)
def plotHeatmap(ma, chrBinBoundaries, fig, position, args, cmap, xlabel=None,
                ylabel=None, start_pos=None, start_pos2=None, pNorm=None,
                pAxis=None, pPca=None):
    """
    Draw a matrix as a heatmap with a colorbar and, optionally, an eigenvector track.

    Nothing is drawn (and ``None`` is returned) when the matrix has fewer
    than five rows.

    :param ma: 2D array to plot
    :param chrBinBoundaries: mapping of chromosome name to a (start, end)
                             pair; without ``args.region`` the midpoint of
                             each pair is used as tick position
    :param fig: figure that receives the colorbar (and the axes if ``pAxis``
                is None)
    :param position: axes rectangle passed to ``fig.add_axes`` when
                     ``pAxis`` is None
    :param args: parsed arguments; ``title``, ``vMin``, ``vMax``, ``region``,
                 ``log1p`` and ``scoreName`` are read
    :param cmap: colormap for the heatmap
    :param xlabel: label of the x axis (optional)
    :param ylabel: label of the y axis (optional)
    :param start_pos: coordinates along the first matrix dimension; defaults
                      to ``0 .. ma.shape[0] - 1``
    :param start_pos2: coordinates along the second dimension; defaults to
                       ``start_pos``
    :param pNorm: colour normalisation passed to ``pcolormesh``
    :param pAxis: existing axes to draw into (optional)
    :param pPca: dict with the keys 'axis', 'axis_colorbar' and 'args', or
                 None if no eigenvector track is drawn
    """
    log.debug("plotting heatmap")
    if ma.shape[0] < 5:
        # dict views cannot be indexed on Python 3, so go through a list.
        log.info("Matrix for {} too small to plot. Matrix size: {}".format(
            list(chrBinBoundaries)[0], ma.shape))
        return

    axHeat2 = pAxis if pAxis is not None else fig.add_axes(position)

    if args.title:
        axHeat2.set_title(toString(args.title))

    if start_pos is None:
        start_pos = np.arange(ma.shape[0])
    if start_pos2 is None:
        start_pos2 = start_pos

    xmesh, ymesh = np.meshgrid(start_pos, start_pos2)
    img3 = axHeat2.pcolormesh(xmesh.T, ymesh.T, ma, vmin=args.vMin, vmax=args.vMax,
                              cmap=cmap, norm=pNorm)
    axHeat2.invert_yaxis()
    img3.set_rasterized(True)

    xticks = None
    if args.region:
        xtick_lables = relabel_ticks(axHeat2.get_xticks())
        axHeat2.get_xaxis().set_tick_params(which='both', bottom='on', direction='out')
        axHeat2.set_xticklabels(xtick_lables, size='small', rotation=45)

        ytick_lables = relabel_ticks(axHeat2.get_yticks())
        axHeat2.get_yaxis().set_tick_params(which='both', bottom='on', direction='out')
        axHeat2.set_yticklabels(ytick_lables, size='small')

        xticks = [xtick_lables]
    else:
        # One tick in the middle of every chromosome.
        ticks = [
            int(pos[0] + (pos[1] - pos[0]) / 2)
            for pos in itervalues(chrBinBoundaries)
        ]
        labels = toString(list(chrBinBoundaries))
        axHeat2.set_xticks(ticks)
        axHeat2.set_yticks(ticks)
        xticks = [labels, ticks]

        # Many chromosomes: shrink the labels and rotate the x labels.
        if len(labels) > 20:
            axHeat2.set_xticklabels(labels, size=4, rotation=90)
            axHeat2.set_yticklabels(labels, size=4)
        else:
            axHeat2.set_xticklabels(labels, size=8)
            axHeat2.set_yticklabels(labels, size=8)

    if pPca is None:
        divider = make_axes_locatable(axHeat2)
        cax = divider.append_axes("right", size="2.5%", pad=0.09)
    else:
        cax = pPca['axis_colorbar']

    if args.log1p:
        from matplotlib.ticker import LogFormatter
        formatter = LogFormatter(10, labelOnlyBase=False)
        # get a useful log scale
        # that looks like [1, 2, 5, 10, 20, 50, 100, ... etc]
        aa = np.array([1, 2, 5])
        tick_values = np.concatenate([aa * 10**x for x in range(10)])
        cbar = fig.colorbar(img3, ticks=tick_values, format=formatter, cax=cax)
    else:
        cbar = fig.colorbar(img3, cax=cax)

    cbar.solids.set_edgecolor("face")  # to avoid white lines in the color bar in pdf plots
    if args.scoreName:
        cbar.ax.set_ylabel(args.scoreName, rotation=270, size=8)

    if ylabel is not None:
        axHeat2.set_ylabel(toString(ylabel))

    if xlabel is not None:
        axHeat2.set_xlabel(toString(xlabel))

    if pPca:
        # The track sits below the heatmap, so move the x axis to the top.
        axHeat2.xaxis.set_label_position("top")
        axHeat2.xaxis.tick_top()
        if args.region:
            plotEigenvector(pPca['axis'], pPca['args'].pca,
                            pRegion=pPca['args'].region, pXticks=xticks)
        else:
            plotEigenvector(pPca['axis'], pPca['args'].pca, pXticks=xticks,
                            pChromosomeList=labels)
def plotPerChr(hic_matrix, cmap, args, pBigwig, pResolution):
    """
    Plot every chromosome of ``hic_matrix`` in its own panel.

    Panels are laid out on a grid with at most five chromosomes per row; the
    grid has one extra narrow column at the end. When ``pBigwig`` is truthy,
    each panel gets one track axis per entry of ``args.bigwig`` below the
    heatmap, a colorbar axis to its right and, if
    ``args.bigwigAdditionalVerticalAxis`` is set, one vertical track axis per
    bigwig file to its left.

    :param hic_matrix: matrix object providing ``getChrNames``,
                       ``getChrBinRange``, ``get_chromosome_sizes``,
                       ``nan_bins`` and ``matrix``
    :param cmap: colormap handed through to ``plotHeatmap``
    :param args: parsed command line arguments. NOTE: ``args.region`` is
                 overwritten with each chromosome name in turn.
    :param pBigwig: truthy if bigwig tracks should be drawn
    :param pResolution: passed through unchanged to ``plotHeatmap``
    :return: the matplotlib figure holding all panels
    """
    from math import ceil

    chromosomes = hic_matrix.getChrNames()
    chrom_per_row = 5
    num_rows = int(ceil(float(len(chromosomes)) / chrom_per_row))
    num_cols = min(chrom_per_row, len(chromosomes))
    width_ratios = [1.0] * num_cols + [0.05]
    grids = gridspec.GridSpec(num_rows, num_cols + 1,
                              width_ratios=width_ratios,
                              height_ratios=[1] * num_rows)

    fig_height = 6 * num_rows
    fig_width = sum((np.array(width_ratios) + 0.05) * 6)
    if pBigwig:
        # Enlarge the figure once per bigwig track.
        # NOTE(review): the width increment is assumed to belong to this
        # loop (the original guard on bigwigAdditionalVerticalAxis was
        # commented out) - confirm against the intended layout.
        for _ in args.bigwig:
            fig_height += args.increaseFigureHeight
            fig_width += args.increaseFigureWidth

    fig = plt.figure(figsize=(fig_width, fig_height), dpi=args.dpi)

    for idx, chrname in enumerate(chromosomes):
        log.debug('chrom: {}'.format(chrname))
        bigwig_info = None
        row = idx // chrom_per_row
        col = idx % chrom_per_row

        if pBigwig:
            bigwig_info = {'args': args, 'axis': None, 'axis_colorbar': None,
                           'nan_bins': hic_matrix.nan_bins}

            number_of_rows_plot = len(args.bigwig)
            bigwig_heights = [0.07] * number_of_rows_plot
            bigwig_height_ratio = 0.95 - (0.07 * number_of_rows_plot)
            if bigwig_height_ratio < 0.4:
                # Too many tracks: keep 40% for the heatmap and share the
                # remaining 60% equally between the tracks.
                bigwig_height_ratio = 0.4
                _ratio = 0.6 / number_of_rows_plot
                bigwig_heights = [_ratio] * number_of_rows_plot

            if args.bigwigAdditionalVerticalAxis:
                # Columns: one per vertical track, the heatmap, the colorbar.
                gs = gridspec.GridSpecFromSubplotSpec(
                    1 + number_of_rows_plot, 2 + number_of_rows_plot,
                    height_ratios=[bigwig_height_ratio, *bigwig_heights],
                    width_ratios=[*bigwig_heights, 0.97 - (0.07 * number_of_rows_plot), 0.03],
                    subplot_spec=grids[row, col], wspace=0.1, hspace=0.1)
                axis = plt.subplot(gs[0, number_of_rows_plot])
                ax2_list = [plt.subplot(gs[1 + i, number_of_rows_plot])
                            for i in range(number_of_rows_plot)]
                bigwig_vertical_axis_list = [plt.subplot(gs[0, i])
                                             for i in range(number_of_rows_plot)]
                ax3 = plt.subplot(gs[0, number_of_rows_plot + 1])

                bigwig_info['axis'] = ax2_list
                bigwig_info['axis_colorbar'] = ax3
                bigwig_info['axis_vertical'] = bigwig_vertical_axis_list
            else:
                # Columns: the heatmap and the colorbar.
                gs = gridspec.GridSpecFromSubplotSpec(
                    1 + number_of_rows_plot, 2,
                    height_ratios=[bigwig_height_ratio, *bigwig_heights],
                    width_ratios=[0.97, 0.03],
                    subplot_spec=grids[row, col], wspace=0.1, hspace=0.1)
                axis = plt.subplot(gs[0, 0])
                ax2_list = [plt.subplot(gs[1 + i, 0])
                            for i in range(number_of_rows_plot)]
                ax3 = plt.subplot(gs[0, 1])

                bigwig_info['axis'] = ax2_list
                bigwig_info['axis_colorbar'] = ax3
        else:
            axis = plt.subplot(grids[row, col])
        axis.set_title(toString(chrname))

        chrom_range = hic_matrix.getChrBinRange(chrname)
        matrix = np.asarray(hic_matrix.matrix[chrom_range[0]:chrom_range[1],
                                              chrom_range[0]:chrom_range[1]].todense().astype(float))

        norm = None
        if args.log or args.log1p:
            # All three masks are taken before any value is replaced.
            mask = matrix == 0
            mask_nan = np.isnan(matrix)
            mask_inf = np.isinf(matrix)
            log.debug("any nan {}".format(mask_nan.any()))
            log.debug("any inf {}".format(mask_inf.any()))

            try:
                # Replace zeros, NaNs and infs with the smallest remaining
                # value so a logarithmic colour scale can be applied.
                matrix[mask] = np.nanmin(matrix[~mask])
                matrix[mask_nan] = np.nanmin(matrix[~mask_nan])
                matrix[mask_inf] = np.nanmin(matrix[~mask_inf])
            except Exception:
                # Best effort: e.g. np.nanmin fails on an all-zero matrix.
                log.debug("Clearing of matrix failed.")
            log.debug("any nanafter remove of nan: {}".format(
                np.isnan(matrix).any()))
            log.debug("any inf after remove of inf: {}".format(
                np.isinf(matrix).any()))
        if args.log1p:
            matrix += 1
            norm = LogNorm()
        elif args.log:
            norm = LogNorm()

        chr_bin_boundary = OrderedDict()
        chr_bin_boundary[chrname] = hic_matrix.get_chromosome_sizes()[chrname]

        # getRegion reads the region from args, so point it at this chromosome.
        args.region = toString(chrname)
        _, _, _, _, start_pos1, _, _, _, _, start_pos2 = getRegion(args, hic_matrix)

        plotHeatmap(matrix, chr_bin_boundary, fig, None, args, cmap,
                    xlabel=chrname, ylabel=chrname,
                    start_pos=start_pos1, start_pos2=start_pos2,
                    pNorm=norm, pAxis=axis, pBigwig=bigwig_info,
                    pChromsomeStartEndDict=chromosome_start_end(hic_matrix),
                    pResolution=pResolution)
    return fig
def main(args=None):
    """
    Filter low quality bins from a Hi-C matrix and apply iterative correction.

    Steps, in order: mask bins whose row sum is zero, replace NaN/inf values
    by zero, optionally only write the diagnostic plot and stop (when
    ``plotName`` is part of the parsed arguments), mask z-score outliers,
    optionally mask low coverage bins, optionally truncate trans contacts,
    run the iterative correction (per chromosome if ``args.perchr``),
    optionally mask bins inflated beyond ``args.inflationCutoff`` and save
    the matrix to ``args.outFileName``.

    :param args: list of command line arguments (None lets argparse use
                 ``sys.argv``)
    """
    args = parse_arguments().parse_args(args)

    if args.verbose:
        log.setLevel(logging.INFO)

    # A single chromosome of a cooler file can be loaded directly; otherwise
    # load everything and reorder afterwards.
    if check_cooler(args.matrix) and args.chromosomes is not None and len(args.chromosomes) == 1:
        ma = hm.hiCMatrix(args.matrix, pChrnameList=toString(args.chromosomes))
    else:
        ma = hm.hiCMatrix(args.matrix)

        if args.chromosomes:
            ma.reorderChromosomes(toString(args.chromosomes))

    # mask all zero value bins
    row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    zero_rows = row_sum == 0
    log.info("Removing {} zero value bins".format(np.count_nonzero(zero_rows)))
    ma.maskBins(np.flatnonzero(zero_rows))
    matrix_shape = ma.matrix.shape

    ma.matrix = convertNansToZeros(ma.matrix)
    ma.matrix = convertInfsToZeros(ma.matrix)

    if 'plotName' in args:
        plot_total_contact_dist(ma, args)
        log.info("Saving diagnostic plot {}\n".format(args.plotName))
        return

    log.info("matrix contains {} data points. Sparsity {:.3f}.".format(
        len(ma.matrix.data),
        float(len(ma.matrix.data)) / (ma.matrix.shape[0] ** 2)))

    if args.skipDiagonal:
        ma.diagflat(value=0)

    outlier_regions = filter_by_zscore(ma, args.filterThreshold[0],
                                       args.filterThreshold[1], perchr=args.perchr)

    # compute and print some statistics
    pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
    ma.printchrtoremove(outlier_regions,
                        label="Bins that are MAD outliers ({:.2f}%) "
                              "out of".format(pct_outlier),
                        restore_masked_bins=False)

    assert matrix_shape == ma.matrix.shape
    # mask filtered regions
    ma.maskBins(outlier_regions)
    total_filtered_out = set(outlier_regions)

    if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1:
        _chrom, _, _, coverage = zip(*ma.cut_intervals)

        assert isinstance(coverage[0], np.float64)

        failed_bins = np.flatnonzero(
            np.array(coverage) < args.sequencedCountCutoff)

        ma.printchrtoremove(failed_bins, label="Bins with low coverage",
                            restore_masked_bins=False)
        ma.maskBins(failed_bins)
        # Accumulate instead of overwriting, so the outlier bins stay part of
        # the final report (same pattern as the inflation filter below).
        total_filtered_out = total_filtered_out.union(failed_bins)

    if args.transCutoff and 0 < args.transCutoff < 100:
        cutoff = float(args.transCutoff) / 100
        # a usual cutoff is 0.05
        ma.truncTrans(high=cutoff)

    # Row sums before the correction; compared below to detect inflated rows.
    pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    correction_factors = []
    if args.perchr:
        corrected_matrix = lil_matrix(ma.matrix.shape)
        # normalize each chromosome independently
        for chrname in list(ma.interval_trees):
            chr_range = ma.getChrBinRange(chrname)
            chr_submatrix = ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]
            _matrix, _corr_factors = iterative_correction(chr_submatrix, args)
            corrected_matrix[chr_range[0]:chr_range[1],
                             chr_range[0]:chr_range[1]] = _matrix
            correction_factors.append(_corr_factors)
        correction_factors = np.concatenate(correction_factors)
    else:
        corrected_matrix, correction_factors = iterative_correction(ma.matrix, args)

    ma.setMatrixValues(corrected_matrix)
    ma.setCorrectionFactors(correction_factors)
    log.info("Correction factors {}".format(correction_factors[:10]))

    if args.inflationCutoff and args.inflationCutoff > 0:
        after_row_sum = np.asarray(corrected_matrix.sum(axis=1)).flatten()
        # identify rows that were expanded more than args.inflationCutoff times
        to_remove = np.flatnonzero(after_row_sum / pre_row_sum >= args.inflationCutoff)
        ma.printchrtoremove(to_remove,
                            label="inflated >={} "
                                  "regions".format(args.inflationCutoff),
                            restore_masked_bins=False)
        total_filtered_out = total_filtered_out.union(to_remove)
        ma.maskBins(to_remove)

    ma.printchrtoremove(sorted(list(total_filtered_out)),
                        label="Total regions to be removed",
                        restore_masked_bins=False)

    ma.save(args.outFileName, pApplyCorrection=False)
def main(args=None):
    """
    Print, or write to ``args.outFileName``, a summary of each given matrix.

    For cooler files with ``args.no_metadata`` set, the values are taken from
    the cooler info attributes; otherwise the matrix is loaded and the values
    are computed from it. When an output file is given, the reports of all
    matrices are written to it one after the other.

    :param args: list of command line arguments (None lets argparse use
                 ``sys.argv``)
    """
    args = parse_arguments().parse_args(args)

    for matrix_index, matrix in enumerate(args.matrices):
        generated_by = None
        genome_assembly = None
        statistics = None
        generated_by_cooler_lib = None
        tool_url = None
        matrix_generated_by = None
        matrix_generated_by_url = None
        creation_date = None
        bin_length = None
        size = None
        nchroms = None
        num_non_zero = None
        min_non_zero = None
        max_non_zero = None
        sum_elements = None
        num_nan_bins = None

        if check_cooler(matrix) and args.no_metadata:
            cooler_file = cooler.Cooler(matrix)
            # Read the info attributes once instead of once per key.
            info = cooler_file.info
            if info is not None:
                bin_length = info.get('bin-size')
                size = info.get('nbins')
                nchroms = info.get('nchroms')
                num_non_zero = info.get('nnz')
                min_non_zero = info.get('min-value')
                max_non_zero = info.get('max-value')
                creation_date = info.get('creation-date')
                sum_elements = info.get('sum-elements')

                if 'generated-by' in info:
                    generated_by = toString(info['generated-by'])
                if 'genome-assembly' in info:
                    genome_assembly = toString(info['genome-assembly'])
                if 'generated-by-cooler-lib' in info:
                    generated_by_cooler_lib = toString(info['generated-by-cooler-lib'])
                if 'tool-url' in info:
                    tool_url = toString(info['tool-url'])
                if 'matrix-generated-by' in info:
                    matrix_generated_by = toString(info['matrix-generated-by'])
                if 'matrix-generated-by-url' in info:
                    matrix_generated_by_url = toString(info['matrix-generated-by-url'])

                metadata = info.get('metadata')
                if metadata is not None and 'statistics' in metadata:
                    statistics = metadata['statistics']
            chromosome_sizes = cooler_file.chromsizes
        else:
            hic_ma = hm.hiCMatrix(matrix)
            size = hic_ma.matrix.shape[0]
            num_non_zero = hic_ma.matrix.nnz
            # Halve the off-diagonal sum and add the diagonal back in full.
            diagonal_sum = hic_ma.matrix.diagonal().sum()
            sum_elements = ((hic_ma.matrix.sum() - diagonal_sum) / 2) + diagonal_sum
            bin_length = hic_ma.getBinSize()
            num_nan_bins = len(hic_ma.nan_bins)
            min_non_zero = hic_ma.matrix.data.min()
            max_non_zero = hic_ma.matrix.data.max()
            chromosome_sizes = hic_ma.get_chromosome_sizes()

        information = StringIO()
        information.write(
            "# Matrix information file. Created with HiCExplorer's hicInfo version {}\n".format(__version__))

        if matrix is not None:
            information.write("File:\t{}\n".format(matrix))
        if creation_date is not None:
            information.write("Date:\t{}\n".format(creation_date))
        if genome_assembly is not None:
            information.write("Genome assembly:\t{}\n".format(genome_assembly))
        if size is not None:
            information.write("Size:\t{:,}\n".format(size))
        if bin_length is not None:
            information.write("Bin_length:\t{}\n".format(bin_length))
        if sum_elements is not None:
            information.write("Sum of matrix:\t{}\n".format(sum_elements))

        information.write("Chromosomes:length: ")
        for key, value in chromosome_sizes.items():
            information.write("{}: {} bp; ".format(key, value))
        information.write('\n')

        if nchroms is not None:
            information.write("Number of chromosomes:\t{}\n".format(nchroms))
        if num_non_zero is not None:
            information.write("Non-zero elements:\t{:,}\n".format(num_non_zero))
        if min_non_zero is not None:
            information.write("Minimum (non zero):\t{}\n".format(min_non_zero))
        if max_non_zero is not None:
            information.write("Maximum:\t{}\n".format(max_non_zero))
        if num_nan_bins is not None:
            information.write("NaN bins:\t{}\n".format(num_nan_bins))

        if check_cooler(matrix):
            information.write('The following columns are available: {}\n'.format(
                cooler.Cooler(matrix).bins().columns.values))

        if generated_by is not None:
            information.write("\n\nGenerated by:\t{}\n".format(generated_by))
        if generated_by_cooler_lib is not None:
            information.write("Cooler library version:\t{}\n".format(
                generated_by_cooler_lib))
        if tool_url is not None:
            information.write("HiCMatrix url:\t{}\n".format(tool_url))
        if matrix_generated_by is not None:
            information.write(
                "Interaction matrix created with:\t{}\n".format(matrix_generated_by))
        if matrix_generated_by_url is not None:
            information.write("URL:\t{}\n".format(matrix_generated_by_url))
        if statistics is not None:
            information.write("\n\nBuild statistics:\n{}\n".format(statistics))

        if args.outFileName:
            # Truncate the file for the first matrix only; later matrices are
            # appended so their reports do not replace the earlier ones.
            file_mode = 'w' if matrix_index == 0 else 'a'
            with open(args.outFileName, file_mode) as file:
                file.write(information.getvalue())
        else:
            print(information.getvalue())
        information.close()