def _obs_exp_lieberman(pSubmatrix, pLengthChromosome, pChromosomeCount):
    obs_exp_matrix_ = obs_exp_matrix_lieberman(pSubmatrix, pLengthChromosome, pChromosomeCount)
    obs_exp_matrix_ = convertNansToZeros(csr_matrix(obs_exp_matrix_))
    obs_exp_matrix_ = convertInfsToZeros(csr_matrix(obs_exp_matrix_)).todense()
    return obs_exp_matrix_
def _pearson(pSubmatrix):
    pearson_correlation_matrix = np.corrcoef(pSubmatrix)
    pearson_correlation_matrix = convertNansToZeros(csr_matrix(pearson_correlation_matrix))
    pearson_correlation_matrix = convertInfsToZeros(csr_matrix(pearson_correlation_matrix)).todense()
    return pearson_correlation_matrix
def _pearson(pSubmatrix):
    pearson_correlation_matrix = np.corrcoef(pSubmatrix)
    pearson_correlation_matrix = convertNansToZeros(csr_matrix(pearson_correlation_matrix))
    pearson_correlation_matrix = convertInfsToZeros(csr_matrix(pearson_correlation_matrix))
    # if len(pearson_correlation_matrix.data) == 0:
    #     return np.array([[]])
    return pearson_correlation_matrix  # .todense()
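# --- Hedged usage sketch (added for illustration, not part of the original tool) ---
# Shows what the _pearson helper above computes on a tiny dense observed/expected block.
# convertNansToZeros/convertInfsToZeros from hicexplorer.utilities are approximated here
# with np.nan_to_num; the real helpers operate on scipy sparse matrices.
def _example_pearson_sketch():
    import numpy as np
    from scipy.sparse import csr_matrix

    obs_exp = np.array([[1.2, 0.8, 0.5],
                        [0.8, 1.1, 0.9],
                        [0.5, 0.9, 1.3]])
    pearson = np.corrcoef(obs_exp)                                    # row-wise Pearson correlation, shape (3, 3)
    pearson = np.nan_to_num(pearson, nan=0.0, posinf=0.0, neginf=0.0)  # zero NaN/Inf, mirroring the convert* helpers
    return csr_matrix(pearson)                                        # _pearson returns the sparse form (no .todense())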
def __obs_exp(pSubmatrix, pLengthChromosome, pChromosomeCount):
    exp_obs_matrix_ = exp_obs_matrix_lieberman(pSubmatrix, pLengthChromosome, pChromosomeCount)
    exp_obs_matrix_ = convertNansToZeros(csr_matrix(exp_obs_matrix_))
    exp_obs_matrix_ = convertInfsToZeros(csr_matrix(exp_obs_matrix_)).todense()
    return exp_obs_matrix_
def _obs_exp_non_zero(pSubmatrix, ligation_factor):
    obs_exp_matrix_ = obs_exp_matrix_non_zero(pSubmatrix, ligation_factor)
    obs_exp_matrix_ = convertNansToZeros(csr_matrix(obs_exp_matrix_))
    obs_exp_matrix_ = convertInfsToZeros(csr_matrix(obs_exp_matrix_))
    # if len(obs_exp_matrix_.data) == 0:
    #     return np.array([[]])
    return obs_exp_matrix_  # .todense()
def _obs_exp(pSubmatrix):
    obs_exp_matrix_ = obs_exp_matrix(pSubmatrix)
    obs_exp_matrix_ = convertNansToZeros(csr_matrix(obs_exp_matrix_))
    obs_exp_matrix_ = convertInfsToZeros(csr_matrix(obs_exp_matrix_))
    # if len(obs_exp_matrix_.data) == 0:
    #     return np.array([[]])
    return obs_exp_matrix_  # .todense()
def _obs_exp_lieberman(pSubmatrix, pLengthChromosome, pChromosomeCount):
    obs_exp_matrix_ = obs_exp_matrix_lieberman(pSubmatrix, pLengthChromosome, pChromosomeCount)
    obs_exp_matrix_ = convertNansToZeros(csr_matrix(obs_exp_matrix_))
    obs_exp_matrix_ = convertInfsToZeros(csr_matrix(obs_exp_matrix_))
    # if len(obs_exp_matrix_.data) == 0:
    #     return np.array()
    return obs_exp_matrix_  # .todense()
def get_expected_matrix(pSubmatrix):
    expected_interactions_in_distance = expected_interactions(pSubmatrix)
    row, col = pSubmatrix.nonzero()
    distance = np.ceil(np.absolute(row - col) / 2).astype(np.int32)
    expected = expected_interactions_in_distance[distance]
    pSubmatrix.data = expected
    pSubmatrix = convertNansToZeros(csr_matrix(pSubmatrix))
    pSubmatrix = convertInfsToZeros(csr_matrix(pSubmatrix)).todense()
    return pSubmatrix
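# --- Hedged sketch (illustration only, not part of the original module) ---
# get_expected_matrix above looks up, for every stored entry, the expected count at that
# entry's diagonal distance and writes it back into the sparse matrix. The real per-distance
# expectation comes from expected_interactions(); the toy stand-in below simply uses the
# mean of each diagonal of a small dense counts matrix.
def _example_get_expected_sparse(counts):
    import numpy as np
    from scipy.sparse import csr_matrix

    counts = np.asarray(counts, dtype=float)
    n = counts.shape[0]
    expected_per_distance = np.array(
        [np.diagonal(counts, offset=d).mean() for d in range(n)])

    sparse = csr_matrix(counts)
    row, col = sparse.nonzero()
    distance = np.absolute(row - col)              # bin distance of every stored entry
    sparse.data = expected_per_distance[distance]  # overwrite observed with expected counts
    return sparse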
def _obs_exp(pSubmatrix):
    obs_exp_matrix_ = obs_exp_matrix(pSubmatrix)
    obs_exp_matrix_ = convertNansToZeros(csr_matrix(obs_exp_matrix_))
    obs_exp_matrix_ = convertInfsToZeros(csr_matrix(obs_exp_matrix_))
    # log.error('obs_exp_matrix_.data {}'.format(obs_exp_matrix_.data))
    # if len(obs_exp_matrix_.data) == 0:
    #     log.debug('No data!')
    #     return np.array([[]])
    return obs_exp_matrix_  # .todense()
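# --- Hedged sketch (illustration only, not part of the original module) ---
# The _obs_exp* helpers above all follow the same pattern: divide observed counts by the
# expected counts for the corresponding distance, then zero out NaN/Inf entries (bins with
# no expected coverage). A dense toy version of that ratio, using a mean-per-diagonal
# expectation as a stand-in for the obs_exp_matrix* functions:
def _example_obs_exp_ratio(counts):
    import numpy as np

    counts = np.asarray(counts, dtype=float)
    n = counts.shape[0]
    expected_per_distance = np.array(
        [np.diagonal(counts, offset=d).mean() for d in range(n)])
    row, col = np.indices(counts.shape)
    expected = expected_per_distance[np.abs(row - col)]

    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = counts / expected
    # zero out NaN/Inf, mirroring convertNansToZeros / convertInfsToZeros
    return np.nan_to_num(ratio, nan=0.0, posinf=0.0, neginf=0.0)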
def main(args=None):
    args = parse_arguments().parse_args(args)

    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        log.error("Number of output file names and number of eigenvectors do not match. "
                  "Please provide the name of each file.\n"
                  "Files: {}\nNumber of eigenvectors: {}".format(args.outputFileName,
                                                                 args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []

    # PCA is computed per chromosome
    length_chromosome = 0
    chromosome_count = len(ma.getChrNames())

    if args.pearsonMatrix:
        trasf_matrix_pearson = lil_matrix(ma.matrix.shape)
    if args.obsexpMatrix:
        trasf_matrix_obsexp = lil_matrix(ma.matrix.shape)

    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]

    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        submatrix = ma.matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]]

        if args.norm:
            exp_obs_matrix_ = exp_obs_matrix_norm(submatrix, length_chromosome, chromosome_count)
            exp_obs_matrix_ = convertNansToZeros(csr_matrix(exp_obs_matrix_)).todense()
            exp_obs_matrix_ = convertInfsToZeros(csr_matrix(exp_obs_matrix_)).todense()
        else:
            exp_obs_matrix_ = exp_obs_matrix_lieberman(submatrix, length_chromosome, chromosome_count)
            exp_obs_matrix_ = convertNansToZeros(csr_matrix(exp_obs_matrix_)).todense()
            exp_obs_matrix_ = convertInfsToZeros(csr_matrix(exp_obs_matrix_)).todense()

        if args.obsexpMatrix:
            trasf_matrix_obsexp[chr_range[0]:chr_range[1],
                                chr_range[0]:chr_range[1]] = lil_matrix(exp_obs_matrix_)

        pearson_correlation_matrix = np.corrcoef(exp_obs_matrix_)
        pearson_correlation_matrix = convertNansToZeros(csr_matrix(pearson_correlation_matrix)).todense()
        pearson_correlation_matrix = convertInfsToZeros(csr_matrix(pearson_correlation_matrix)).todense()

        if args.pearsonMatrix:
            trasf_matrix_pearson[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = lil_matrix(pearson_correlation_matrix)

        corrmatrix = np.cov(pearson_correlation_matrix)
        corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
        corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
        evals, eigs = linalg.eig(corrmatrix)

        k = args.numberOfEigenvectors
        chrom, start, end, _ = zip(*ma.cut_intervals[chr_range[0]:chr_range[1]])
        vecs_list += eigs[:, :k].tolist()
        chrom_list += chrom
        start_list += start
        end_list += end

    if args.pearsonMatrix:
        file_type = 'cool'
        if args.pearsonMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(trasf_matrix_pearson.tocsr(),
                                                     ma.cut_intervals,
                                                     ma.nan_bins,
                                                     ma.correction_factors,
                                                     ma.distance_counts)
        matrixFileHandlerOutput.save(args.pearsonMatrix, pSymmetric=True, pApplyCorrection=False)

    if args.obsexpMatrix:
        file_type = 'cool'
        if args.obsexpMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(trasf_matrix_obsexp.tocsr(),
                                                     ma.cut_intervals,
                                                     ma.nan_bins,
                                                     ma.correction_factors,
                                                     ma.distance_counts)
        matrixFileHandlerOutput.save(args.obsexpMatrix, pSymmetric=True, pApplyCorrection=False)

    if args.geneTrack:
        vecs_list = correlateEigenvectorWithGeneTrack(ma, vecs_list, args.geneTrack)

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert (len(vecs_list) == len(chrom_list))
            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    if len(value) == args.numberOfEigenvectors:
                        if isinstance(value[idx], np.complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(toString(chrom_list[i]),
                                                                start_list[i],
                                                                end_list[i],
                                                                value[idx]))

    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error("ERROR: Your version of pyBigWig does not support numpy: {}".format(pyBigWig.__file__))
            exit(1)

        old_chrom = chrom_list[0]
        header = []
        for i, _chrom in enumerate(chrom_list):
            if old_chrom != _chrom:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = _chrom
        header.append((toString(chrom_list[-1]), end_list[-1]))

        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))
            assert (len(vecs_list) == len(chrom_list))
            _chrom_list = []
            _start_list = []
            _end_list = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            # set big wig header
            bw.addHeader(header)
            # create entry lists
            for i, value in enumerate(vecs_list):
                # it can happen that some 'value' has fewer dimensions than it should
                if len(value) == args.numberOfEigenvectors:
                    if isinstance(value[idx], np.complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    _chrom_list.append(toString(chrom_list[i]))
                    _start_list.append(start_list[i])
                    _end_list.append(end_list[i])

            # write entries
            bw.addEntries(_chrom_list, _start_list, ends=_end_list, values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)
def main(args=None):
    args = parse_arguments().parse_args(args)
    if args.verbose:
        log.setLevel(logging.INFO)

    # args.chromosomes
    if check_cooler(args.matrix) and args.chromosomes is not None and len(args.chromosomes) == 1:
        ma = hm.hiCMatrix(args.matrix, pChrnameList=toString(args.chromosomes))
    else:
        ma = hm.hiCMatrix(args.matrix)
        if args.chromosomes:
            ma.reorderChromosomes(toString(args.chromosomes))

    # mask all zero value bins
    if 'correctionMethod' in args:
        if args.correctionMethod == 'ICE':
            row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
            log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
            ma.maskBins(np.flatnonzero(row_sum == 0))
            matrix_shape = ma.matrix.shape
    if 'plotName' in args:
        row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
        log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
        ma.maskBins(np.flatnonzero(row_sum == 0))
        matrix_shape = ma.matrix.shape

    ma.matrix = convertNansToZeros(ma.matrix)
    ma.matrix = convertInfsToZeros(ma.matrix)
    ma.matrix = ma.matrix.astype(np.float64, copy=True)

    log.debug('ma.matrix.indices {}'.format(ma.matrix.indices.dtype))
    log.debug('ma.matrix.data {}'.format(ma.matrix.data.dtype))
    log.debug('ma.matrix.indptr {}'.format(ma.matrix.indptr.dtype))

    # log.debug('ma.matrix.indices {}'.format(np.max(ma.matrix.indices)))
    # log.debug('ma.matrix.data {}'.format(np.max(ma.matrix.data)))
    # log.debug('ma.matrix.indptr {}'.format(np.max(ma.matrix.indptr)))

    # ma.matrix.indptr = ma.matrix.indptr.astype(np.int32, copy=False)
    # ma.matrix.indices = ma.matrix.indices.astype(np.int32, copy=False)

    if 'plotName' in args:
        plot_total_contact_dist(ma, args)
        log.info("Saving diagnostic plot {}\n".format(args.plotName))
        return

    log.info("matrix contains {} data points. Sparsity {:.3f}.".format(
        len(ma.matrix.data),
        float(len(ma.matrix.data)) / (ma.matrix.shape[0] ** 2)))

    if args.skipDiagonal:
        ma.diagflat(value=0)

    total_filtered_out = set()
    if args.correctionMethod == 'ICE':
        if not args.filterThreshold:
            log.error('min and max filtering thresholds should be set')
            sys.exit(1)
        outlier_regions = filter_by_zscore(ma, args.filterThreshold[0],
                                           args.filterThreshold[1], perchr=args.perchr)
        # compute and print some statistics
        pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
        ma.printchrtoremove(outlier_regions,
                            label="Bins that are MAD outliers ({:.2f}%) "
                            "out of {}".format(pct_outlier, ma.matrix.shape[0]),
                            restore_masked_bins=False)

        assert matrix_shape == ma.matrix.shape

        # mask filtered regions
        ma.maskBins(outlier_regions)
        total_filtered_out = set(outlier_regions)

    if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1:
        chrom, _, _, coverage = zip(*ma.cut_intervals)

        assert type(coverage[0]) == np.float64

        failed_bins = np.flatnonzero(np.array(coverage) < args.sequencedCountCutoff)

        ma.printchrtoremove(failed_bins, label="Bins with low coverage", restore_masked_bins=False)
        ma.maskBins(failed_bins)
        total_filtered_out = set(failed_bins)
        """
        ma.matrix, to_remove = fill_gaps(ma, failed_bins)
        log.warning("From {} failed bins, {} could "
                    "not be filled\n".format(len(failed_bins), len(to_remove)))
        ma.maskBins(to_remove)
        """

    if args.transCutoff and 0 < args.transCutoff < 100:
        cutoff = float(args.transCutoff) / 100
        # a usual cutoff is 0.05
        ma.truncTrans(high=cutoff)

    pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()

    correction_factors = []
    corrected_matrix = lil_matrix(ma.matrix.shape)
    if args.perchr:
        # normalize each chromosome independently
        for chrname in list(ma.interval_trees):
            chr_range = ma.getChrBinRange(chrname)
            chr_submatrix = ma.matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]]
            if args.correctionMethod == 'ICE':
                _matrix, _corr_factors = iterative_correction(chr_submatrix, args)
                corrected_matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]] = _matrix
                correction_factors.append(_corr_factors)
            else:
                # set the KR matrix along with its correction factors vector
                assert (args.correctionMethod == 'KR')
                log.debug("Loading a float sparse matrix for KR balancing")
                kr = kr_balancing(chr_submatrix.shape[0],
                                  chr_submatrix.shape[1],
                                  chr_submatrix.count_nonzero(),
                                  chr_submatrix.indptr.astype(np.int64, copy=False),
                                  chr_submatrix.indices.astype(np.int64, copy=False),
                                  chr_submatrix.data.astype(np.float64, copy=False))
                kr.computeKR()
                if args.outFileName.endswith('.h5'):
                    corrected_matrix[chr_range[0]:chr_range[1],
                                     chr_range[0]:chr_range[1]] = kr.get_normalised_matrix(True)
                # correction_factors.append(np.true_divide(1,
                #                           kr.get_normalisation_vector(False).todense()))
                correction_factors.append(kr.get_normalisation_vector(False).todense())

        correction_factors = np.concatenate(correction_factors)

    else:
        if args.correctionMethod == 'ICE':
            corrected_matrix, correction_factors = iterative_correction(ma.matrix, args)
            ma.setMatrixValues(corrected_matrix)
        else:
            assert (args.correctionMethod == 'KR')
            log.debug("Loading a float sparse matrix for KR balancing")
            kr = kr_balancing(ma.matrix.shape[0],
                              ma.matrix.shape[1],
                              ma.matrix.count_nonzero(),
                              ma.matrix.indptr.astype(np.int64, copy=False),
                              ma.matrix.indices.astype(np.int64, copy=False),
                              ma.matrix.data.astype(np.float64, copy=False))
            log.debug('passed pointers')
            kr.computeKR()
            log.debug('computation done')

            # set it to False since the vector is already normalised
            # with the previous True
            # correction_factors = np.true_divide(1, kr.get_normalisation_vector(False).todense())
            correction_factors = kr.get_normalisation_vector(False).todense()

            if args.outFileName.endswith('.h5'):
                corrected_matrix = kr.get_normalised_matrix(True)

    if args.outFileName.endswith('.h5'):
        ma.setMatrixValues(corrected_matrix)
    # if
    ma.setCorrectionFactors(correction_factors)

    log.debug("Correction factors {}".format(correction_factors[:10]))

    if args.inflationCutoff and args.inflationCutoff > 0 and args.correctionMethod == 'ICE':
        after_row_sum = np.asarray(corrected_matrix.sum(axis=1)).flatten()
        # identify rows that were expanded more than args.inflationCutoff times
        to_remove = np.flatnonzero(after_row_sum / pre_row_sum >= args.inflationCutoff)
        ma.printchrtoremove(to_remove,
                            label="inflated >={} "
                            "regions".format(args.inflationCutoff),
                            restore_masked_bins=False)
        total_filtered_out = total_filtered_out.union(to_remove)
        ma.maskBins(to_remove)

    ma.printchrtoremove(sorted(list(total_filtered_out)),
                        label="Total regions to be removed",
                        restore_masked_bins=False)

    ma.save(args.outFileName, pApplyCorrection=False)
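# --- Hedged sketch (illustration only, not part of the original module) ---
# iterative_correction and kr_balancing are external to this file. The core idea of the
# ICE branch above is matrix balancing: repeatedly divide each entry by the product of
# per-bin biases derived from the row sums until all row sums become equal. A minimal
# dense version of that idea, not the implementation used by the tool:
def _example_ice_like_balancing(counts, iterations=50):
    import numpy as np

    w = np.asarray(counts, dtype=float).copy()
    biases = np.ones(w.shape[0])
    for _ in range(iterations):
        row_sum = w.sum(axis=1)
        if not np.any(row_sum > 0):
            break
        scale = row_sum / np.mean(row_sum[row_sum > 0])
        scale[scale == 0] = 1.0                    # leave empty bins untouched
        w = w / np.outer(scale, scale)             # divide by the row/column bias product
        biases *= scale
    return w, biases                               # balanced matrix and per-bin correction factors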
def plot_total_contact_dist(hic_ma, args):
    """
    Plots the distribution of the number of contacts per bin (excluding self contacts).
    Outliers with a high number of contacts are removed for the plot.

    :param hic_ma: sparse matrix
    :return:
    """
    use('Agg')
    majorlocator = MultipleLocator(1)
    majorformatter = FormatStrFormatter('%d')
    minorlocator = MultipleLocator(0.2)

    def plot_histogram(row_sum_values, mad_values, ax1, title=None):
        if args.xMax:
            ax1.set_xlim(ax1.get_xlim()[0], args.xMax)
            row_sum_values = row_sum_values[row_sum_values < args.xMax]

        ax1.set_xlabel("total counts per bin")
        ax1.set_ylabel("frequency")
        # ax1.xaxis.grid(True)
        ax1.patch.set_visible(False)
        dist, bin_s, __ = ax1.hist(row_sum_values, 100, color='green')

        # add second axis on top
        ax2 = ax1.twiny()
        ax2.set_xlabel("modified z-score")
        ax2.xaxis.set_major_locator(majorlocator)
        ax2.xaxis.set_major_formatter(majorformatter)
        ax2.xaxis.grid(True, which='minor')
        # for the minor ticks, use no labels; default NullFormatter
        ax2.xaxis.set_minor_locator(minorlocator)

        # update second axis values by mapping the min/max of the main axis
        # to the values translated into modified z-scores

        # workaround for 'Axis limits cannot be NaN or Inf' bug in version 2.1.1
        log.debug("ax1.get_xlim(): {}".format(ax1.get_xlim()))
        log.debug("np.array(ax1.get_xlim()): {}".format(np.array(ax1.get_xlim())))
        log.debug("mad_values.value_to_mad(np.array(ax1.get_xlim())): {}".format(
            mad_values.value_to_mad(np.array(ax1.get_xlim()))))

        ax2.set_xlim(mad_values.value_to_mad(np.array(ax1.get_xlim())))

        # get first local minimum value
        local_min = [x for x, y in enumerate(dist)
                     if 1 <= x < len(dist) - 1 and dist[x - 1] > y < dist[x + 1]]

        if len(local_min) > 0:
            threshold = bin_s[local_min[0]]
        else:
            threshold = None

        if threshold:
            mad_threshold = mad_values.value_to_mad(threshold)
            ymin, ymax = ax2.get_ylim()
            ax2.vlines(mad_threshold, ymin, ymax)
            if title:
                log.info("{}: mad threshold {}".format(title, mad_threshold))
            else:
                log.info("mad threshold {}".format(mad_threshold))

    # replace nan by 0
    # hic_ma.matrix.data[np.isnan(hic_ma.matrix.data)] = 0
    hic_ma.matrix = convertNansToZeros(hic_ma.matrix)
    hic_ma.matrix = convertInfsToZeros(hic_ma.matrix)

    if args.perchr:
        chroms = hic_ma.getChrNames()
        if len(chroms) > 30:
            log.warning("The matrix contains {} chromosomes. It is not "
                        "practical to plot each. Try using --chromosomes to "
                        "select some chromosomes or plot a single histogram.".format(len(chroms)))
        num_rows = int(np.ceil(float(len(chroms)) / 5))
        num_cols = min(len(chroms), 5)
        grids = gridspec.GridSpec(num_rows, num_cols)
        fig = plt.figure(figsize=(6 * num_cols, 5 * num_rows))
        ax = {}
        for plot_num, chrname in enumerate(chroms):
            log.info("Plotting chromosome {}".format(chrname))
            chr_range = hic_ma.getChrBinRange(chrname)
            chr_submatrix = hic_ma.matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]]
            row_sum = np.asarray(chr_submatrix.sum(axis=1)).flatten()
            row_sum = row_sum - chr_submatrix.diagonal()
            mad = MAD(row_sum)
            modified_z_score = mad.get_motified_zscores()

            # remove high outliers
            row_sum = row_sum[modified_z_score < 5]

            col = plot_num % num_cols
            row = plot_num // num_cols
            ax[chrname] = fig.add_subplot(grids[row, col])

            plot_histogram(row_sum, mad, ax[chrname], title=chrname)
            ax[chrname].set_title(chrname)
    else:
        fig = plt.figure()
        row_sum = np.asarray(hic_ma.matrix.sum(axis=1)).flatten()
        row_sum = row_sum - hic_ma.matrix.diagonal()
        mad = MAD(row_sum)
        modified_z_score = mad.get_motified_zscores()

        # remove high outliers
        row_sum = row_sum[modified_z_score < 5]
        ax = fig.add_subplot(111)
        plot_histogram(row_sum, mad, ax)

    plt.tight_layout()
    plt.savefig(args.plotName)
    plt.close()
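# --- Hedged sketch (illustration only, not part of the original module) ---
# The MAD class used above is defined elsewhere in hicexplorer. The modified z-score it
# is assumed to compute is the standard one, 0.6745 * (x - median) / MAD, where MAD is the
# median absolute deviation; bins with a modified z-score >= 5 are dropped from the plot.
def _example_modified_zscore(values):
    import numpy as np

    values = np.asarray(values, dtype=float)
    median = np.median(values)
    mad = np.median(np.abs(values - median))   # median absolute deviation
    if mad == 0:
        return np.zeros_like(values)
    return 0.6745 * (values - median) / mad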
def main(args=None):
    args = parse_arguments().parse_args(args)

    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        log.error("Number of output file names and number of eigenvectors do not match. "
                  "Please provide the name of each file.\n"
                  "Files: {}\nNumber of eigenvectors: {}".format(args.outputFileName,
                                                                 args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []

    # PCA is computed per chromosome
    length_chromosome = 0
    chromosome_count = len(ma.getChrNames())
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]

    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        log.debug("Computing pca for chromosome: {}".format(chrname))

        submatrix = ma.matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]]

        exp_obs_matrix_ = exp_obs_matrix_lieberman(submatrix, length_chromosome, chromosome_count)
        exp_obs_matrix_ = convertNansToZeros(csr_matrix(exp_obs_matrix_)).todense()
        exp_obs_matrix_ = convertInfsToZeros(csr_matrix(exp_obs_matrix_)).todense()

        pearson_correlation_matrix = np.corrcoef(exp_obs_matrix_)
        pearson_correlation_matrix = convertNansToZeros(csr_matrix(pearson_correlation_matrix)).todense()
        pearson_correlation_matrix = convertInfsToZeros(csr_matrix(pearson_correlation_matrix)).todense()

        corrmatrix = np.cov(pearson_correlation_matrix)
        corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
        corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
        evals, eigs = linalg.eig(corrmatrix)

        k = args.numberOfEigenvectors
        chrom, start, end, _ = zip(*ma.cut_intervals[chr_range[0]:chr_range[1]])
        vecs_list += eigs[:, :k].tolist()
        chrom_list += chrom
        start_list += start
        end_list += end

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert (len(vecs_list) == len(chrom_list))
            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    if len(value) == args.numberOfEigenvectors:
                        if isinstance(value[idx], np.complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(toString(chrom_list[i]),
                                                                start_list[i],
                                                                end_list[i],
                                                                value[idx]))

    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error("ERROR: Your version of pyBigWig does not support numpy: {}".format(pyBigWig.__file__))
            exit(1)

        old_chrom = chrom_list[0]
        header = []
        for i, chrom_ in enumerate(chrom_list):
            if old_chrom != chrom_:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = chrom_
        header.append((toString(chrom_list[-1]), end_list[-1]))

        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))
            assert (len(vecs_list) == len(chrom_list))
            chrom_list_ = []
            start_list_ = []
            end_list_ = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            # set big wig header
            bw.addHeader(header)
            # create entry lists
            for i, value in enumerate(vecs_list):
                # it can happen that some 'value' has fewer dimensions than it should
                if len(value) == args.numberOfEigenvectors:
                    if isinstance(value[idx], np.complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    chrom_list_.append(toString(chrom_list[i]))
                    start_list_.append(start_list[i])
                    end_list_.append(end_list[i])

            # write entries
            bw.addEntries(chrom_list_, start_list_, ends=end_list_, values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)
def _obs_exp_non_zero(pSubmatrix):
    obs_exp_matrix_ = obs_exp_matrix_non_zero(pSubmatrix)
    obs_exp_matrix_ = convertNansToZeros(csr_matrix(obs_exp_matrix_))
    obs_exp_matrix_ = convertInfsToZeros(csr_matrix(obs_exp_matrix_)).todense()
    return obs_exp_matrix_
def main(args=None):
    args = parse_arguments().parse_args(args)
    if args.verbose:
        log.setLevel(logging.INFO)

    # args.chromosomes
    if check_cooler(args.matrix) and args.chromosomes is not None and len(args.chromosomes) == 1:
        ma = hm.hiCMatrix(args.matrix, pChrnameList=toString(args.chromosomes))
    else:
        ma = hm.hiCMatrix(args.matrix)
        if args.chromosomes:
            ma.reorderChromosomes(toString(args.chromosomes))

    # mask all zero value bins
    row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
    ma.maskBins(np.flatnonzero(row_sum == 0))
    matrix_shape = ma.matrix.shape

    ma.matrix = convertNansToZeros(ma.matrix)
    ma.matrix = convertInfsToZeros(ma.matrix)

    if 'plotName' in args:
        plot_total_contact_dist(ma, args)
        log.info("Saving diagnostic plot {}\n".format(args.plotName))
        return

    log.info("matrix contains {} data points. Sparsity {:.3f}.".format(
        len(ma.matrix.data),
        float(len(ma.matrix.data)) / (ma.matrix.shape[0] ** 2)))

    if args.skipDiagonal:
        ma.diagflat(value=0)

    outlier_regions = filter_by_zscore(ma, args.filterThreshold[0],
                                       args.filterThreshold[1], perchr=args.perchr)
    # compute and print some statistics
    pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
    ma.printchrtoremove(outlier_regions,
                        label="Bins that are MAD outliers ({:.2f}%) "
                        "out of {}".format(pct_outlier, ma.matrix.shape[0]),
                        restore_masked_bins=False)

    assert matrix_shape == ma.matrix.shape

    # mask filtered regions
    ma.maskBins(outlier_regions)
    total_filtered_out = set(outlier_regions)

    if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1:
        chrom, _, _, coverage = zip(*ma.cut_intervals)

        assert type(coverage[0]) == np.float64

        failed_bins = np.flatnonzero(np.array(coverage) < args.sequencedCountCutoff)

        ma.printchrtoremove(failed_bins, label="Bins with low coverage", restore_masked_bins=False)
        ma.maskBins(failed_bins)
        total_filtered_out = set(failed_bins)
        """
        ma.matrix, to_remove = fill_gaps(ma, failed_bins)
        log.warning("From {} failed bins, {} could "
                    "not be filled\n".format(len(failed_bins), len(to_remove)))
        ma.maskBins(to_remove)
        """

    if args.transCutoff and 0 < args.transCutoff < 100:
        cutoff = float(args.transCutoff) / 100
        # a usual cutoff is 0.05
        ma.truncTrans(high=cutoff)

    pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()

    correction_factors = []
    if args.perchr:
        corrected_matrix = lil_matrix(ma.matrix.shape)
        # normalize each chromosome independently
        for chrname in list(ma.interval_trees):
            chr_range = ma.getChrBinRange(chrname)
            chr_submatrix = ma.matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]]
            _matrix, _corr_factors = iterative_correction(chr_submatrix, args)
            corrected_matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]] = _matrix
            correction_factors.append(_corr_factors)
        correction_factors = np.concatenate(correction_factors)

    else:
        corrected_matrix, correction_factors = iterative_correction(ma.matrix, args)

    ma.setMatrixValues(corrected_matrix)
    ma.setCorrectionFactors(correction_factors)

    log.info("Correction factors {}".format(correction_factors[:10]))

    if args.inflationCutoff and args.inflationCutoff > 0:
        after_row_sum = np.asarray(corrected_matrix.sum(axis=1)).flatten()
        # identify rows that were expanded more than args.inflationCutoff times
        to_remove = np.flatnonzero(after_row_sum / pre_row_sum >= args.inflationCutoff)
        ma.printchrtoremove(to_remove,
                            label="inflated >={} "
                            "regions".format(args.inflationCutoff),
                            restore_masked_bins=False)
        total_filtered_out = total_filtered_out.union(to_remove)
        ma.maskBins(to_remove)

    ma.printchrtoremove(sorted(list(total_filtered_out)),
                        label="Total regions to be removed",
                        restore_masked_bins=False)

    ma.save(args.outFileName, pApplyCorrection=False)
def open_and_store_matrix(pMatrixName, pMatricesList, pIndex, pXDimension, pChromosomes, pNorm,
                          pExtraTrack, pHistonMarkType, pBinarization, pQueue):
    compartments_matrix = None

    for i, matrix in enumerate(pMatricesList):
        ma = hm.hiCMatrix(pMatrixName + '::' + matrix)

        # WARNING
        # DO NOT APPLY BIN MASKING, WILL LEAD TO DIFFERENT SIZES OF THE CHROMOSOMES
        # THIS IS CAUSING A FAIL OF THE COMPUTATION
        # ma.maskBins(ma.nan_bins)
        k = 1
        if pChromosomes:
            ma.keepOnlyTheseChr(pChromosomes)

        vecs_list = []
        chrom_list = []
        start_list = []
        end_list = []

        # PCA is computed per chromosome
        length_chromosome = 0
        chromosome_count = len(ma.getChrNames())
        for chrname in ma.getChrNames():
            chr_range = ma.getChrBinRange(chrname)
            length_chromosome += chr_range[1] - chr_range[0]

        if pExtraTrack and (pExtraTrack.endswith('.bw') or pExtraTrack.endswith('.bigwig')):
            bwTrack = pyBigWig.open(pExtraTrack, 'r')

        for chrname in ma.getChrNames():
            chr_range = ma.getChrBinRange(chrname)

            submatrix = ma.matrix[chr_range[0]:chr_range[1], chr_range[0]:chr_range[1]]
            if pNorm:
                obs_exp_matrix_ = obs_exp_matrix_norm(submatrix)
            else:
                obs_exp_matrix_ = obs_exp_matrix_lieberman(submatrix, length_chromosome, chromosome_count)
            obs_exp_matrix_ = convertNansToZeros(csr_matrix(obs_exp_matrix_)).todense()
            obs_exp_matrix_ = convertInfsToZeros(csr_matrix(obs_exp_matrix_)).todense()

            pearson_correlation_matrix = np.corrcoef(obs_exp_matrix_)
            pearson_correlation_matrix = convertNansToZeros(csr_matrix(pearson_correlation_matrix)).todense()
            pearson_correlation_matrix = convertInfsToZeros(csr_matrix(pearson_correlation_matrix)).todense()

            corrmatrix = np.cov(pearson_correlation_matrix)
            corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
            corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
            evals, eigs = linalg.eig(corrmatrix)

            chrom, start, end, _ = zip(*ma.cut_intervals[chr_range[0]:chr_range[1]])

            chrom_list += chrom
            start_list += start
            end_list += end
            if pExtraTrack and (pExtraTrack.endswith('.bw') or pExtraTrack.endswith('.bigwig')):
                assert (len(end) == len(start))
                correlateEigenvectorWithHistonMarkTrack(eigs[:, :k].transpose(), bwTrack, chrname,
                                                        start, end, pExtraTrack, pHistonMarkType)

            vecs_list += eigs[:, :k].tolist()

        if compartments_matrix is None:
            compartments_matrix = np.zeros([pXDimension, len(np.array(vecs_list).flatten())], dtype=np.float)

        eigenvector = np.real(np.array(vecs_list).flatten())
        mask = np.isnan(eigenvector)
        if len(mask) > 0:
            eigenvector[mask] = 0
        mask = np.isinf(eigenvector)
        if len(mask) > 0:
            eigenvector[mask] = 0

        if pBinarization:
            mask = eigenvector <= 0
            eigenvector[mask] = -1
            mask = eigenvector > 0
            eigenvector[mask] = 1

        compartments_matrix[pIndex + i, :] = eigenvector

    pQueue.put(compartments_matrix)
    return
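# --- Hedged sketch (illustration only, not part of the original module) ---
# The pBinarization branch above reduces the per-chromosome eigenvector to compartment
# signs: non-positive entries become -1, positive entries become 1, after NaN/Inf values
# have been set to zero. A compact stand-in for that step:
def _example_binarize_eigenvector(eigenvector):
    import numpy as np

    ev = np.real(np.asarray(eigenvector, dtype=float))
    ev = np.nan_to_num(ev, nan=0.0, posinf=0.0, neginf=0.0)  # clean NaN/Inf as above
    return np.where(ev > 0, 1.0, -1.0)                       # sign pattern used as the A/B compartment call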