def diff_ent(cool_uri_1, cool_uri_2, bed_path, output,
             inner_window, balance, gap_ratio, processes, chunk_size):
    """
    \b
    Args
    ----
    cool_uri_1 : str
        URI of input cool container 1.
    cool_uri_2 : str
        URI of input cool container 2.
    bed_path : str
        Path to input BED.
    output : str
        Path to output BEDGRAPH file.
    """
    c1 = Cooler(cool_uri_1)
    matrix_selector_1 = MatrixSelector(c1, balance=balance)
    c2 = Cooler(cool_uri_2)
    matrix_selector_2 = MatrixSelector(c2, balance=balance)
    regions = read_bed(bed_path)
    chunks = chunking(regions, chunk_size)
    if os.path.exists(output):
        subprocess.check_call(['rm', output])
    with ProcessPoolExecutor(max_workers=processes) as executor:
        map_ = map if processes == 1 else executor.map
        for out_chunk in map_(process_region_chunk_diff_ent, chunks,
                              repeat(matrix_selector_1),
                              repeat(matrix_selector_2),
                              repeat(gap_ratio)):
            write_result(out_chunk, output, mode='a')
def __init__(self, source_uri, bins, chunksize, batchsize, map=map):
    from cooler.api import Cooler

    self._map = map
    self.source_uri = source_uri
    self.chunksize = chunksize
    self.batchsize = batchsize

    clr = Cooler(source_uri)
    self._size = clr.info['nnz']
    self.old_binsize = clr.binsize
    self.old_chrom_offset = clr._load_dset('indexes/chrom_offset')
    self.old_bin1_offset = clr._load_dset('indexes/bin1_offset')
    self.gs = GenomeSegmentation(clr.chromsizes, bins)
    self.new_binsize = get_binsize(bins)
    assert self.new_binsize % self.old_binsize == 0
    self.factor = self.new_binsize // self.old_binsize
def region(cool_uri, bed_path, output, inner_window, balance, coverage,
           processes, chunk_size):
    """
    \b
    Args
    ----
    cool_uri : str
        URI of input cool container.
    bed_path : str
        Path to input BED.
    output : str
        Path to output BEDGRAPH file.
    """
    c = Cooler(cool_uri)
    matrix_selector = MatrixSelector(c, balance=balance)
    regions = read_bed(bed_path)
    chunks = chunking(regions, chunk_size)
    if os.path.exists(output):
        subprocess.check_call(['rm', output])
    with ProcessPoolExecutor(max_workers=processes) as executor:
        map_ = map if processes == 1 else executor.map
        for out_chunk in map_(process_region_chunk, chunks,
                              repeat(matrix_selector), repeat(coverage)):
            bgs = filter_abnormal(out_chunk)
            write_bedgraph(bgs, output, mode='a')
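
# Pattern sketch (not part of the original module): `region`, `diff_ent` and
# `loci` all dispatch chunks the same way -- fall back to the built-in map for
# a single process, otherwise use ProcessPoolExecutor.map. A toy stand-alone
# version of that idiom, with a trivial module-level worker so it stays
# picklable for the process pool:
def _square_chunk(chunk, offset):
    # toy worker: square every element and add an offset
    return [x * x + offset for x in chunk]


def _parallel_chunk_example(processes=2):
    from concurrent.futures import ProcessPoolExecutor
    from itertools import repeat

    chunks = [[1, 2], [3, 4], [5, 6]]
    with ProcessPoolExecutor(max_workers=processes) as executor:
        map_ = map if processes == 1 else executor.map
        results = list(map_(_square_chunk, chunks, repeat(10)))
    return results  # [[11, 14], [19, 26], [35, 46]]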
def _aggregate(self, span):
    from cooler.api import Cooler

    lo, hi = span
    logger.info('{} {}'.format(lo, hi))

    try:
        lock.acquire()
        with h5py.File(self.cooler_path, 'r') as h5:
            c = Cooler(h5[self.cooler_root])
            # convert_enum=False should return chroms as int
            table = c.pixels(join=True, convert_enum=False)
            chunk = table[lo:hi]
            #chunk['chrom1'] = pandas.Categorical(chunk['chrom1'], categories=self.chroms)
            #chunk['chrom2'] = pandas.Categorical(chunk['chrom2'], categories=self.chroms)
    finally:
        lock.release()

    # use the "start" point as anchor for re-binning
    # XXX - alternatives: midpoint anchor, proportional re-binning
    binsize = self.gs.binsize
    chrom_binoffset = self.gs.chrom_binoffset
    chrom_abspos = self.gs.chrom_abspos
    start_abspos = self.gs.start_abspos

    chrom_id1 = chunk['chrom1'].values  #.cat.codes.values
    chrom_id2 = chunk['chrom2'].values  #.cat.codes.values
    start1 = chunk['start1'].values
    start2 = chunk['start2'].values

    if binsize is None:
        abs_start1 = chrom_abspos[chrom_id1] + start1
        abs_start2 = chrom_abspos[chrom_id2] + start2
        chunk['bin1_id'] = np.searchsorted(
            start_abspos, abs_start1, side='right') - 1
        chunk['bin2_id'] = np.searchsorted(
            start_abspos, abs_start2, side='right') - 1
    else:
        rel_bin1 = np.floor(start1 / binsize).astype(int)
        rel_bin2 = np.floor(start2 / binsize).astype(int)
        chunk['bin1_id'] = chrom_binoffset[chrom_id1] + rel_bin1
        chunk['bin2_id'] = chrom_binoffset[chrom_id2] + rel_bin2

    grouped = chunk.groupby(['bin1_id', 'bin2_id'], sort=False)
    return grouped['count'].sum().reset_index()
def loci(cool_uri, output, window_size, overlap, inner_window, outer_window,
         balance, subtract_expect, coverage, processes, chunk_size):
    """
    \b
                         inner window
                              |-|
    |--------------------------------------------------------|
    |-                                                       -|
                         outer window
    \b
    Args
    ----
    cool_uri : str
        URI of input cool container.
    output : str
        Path to output BEDGRAPH file.
    """
    c = Cooler(cool_uri)
    resolution = c.info['bin-size']
    chromsizes = c.chromsizes.to_dict()
    chromsizes = {
        chr_: (size // resolution) + 1
        for chr_, size in chromsizes.items()
    }
    chr_chunks = chromosome_chunks(chromsizes, window_size, overlap, chunk_size)
    it1, it2 = tee(chr_chunks)  # split chr_chunks into chroms and chunks
    chroms = map(operator.itemgetter(0), it1)
    chunks = map(operator.itemgetter(1), it2)
    matrix_selector = MatrixSelector(c, balance=balance)
    if os.path.exists(output):
        subprocess.check_call(['rm', output])
    with ProcessPoolExecutor(max_workers=processes) as executor:
        map_ = map if processes == 1 else executor.map
        tmp_file = output + ".tmp"
        idx = 0
        args = (repeat(matrix_selector), chroms, chunks,
                repeat(inner_window), repeat(outer_window),
                repeat(subtract_expect), repeat(coverage))
        for out_chunk in map_(process_loci_chunk, *args):
            print("chunk {}: {}/{} blocks".format(idx, len(out_chunk), chunk_size))
            write_bedgraph(out_chunk, tmp_file, mode='a')
            idx += 1
    sorted_lines = sort_bedGraph(tmp_file)
    bgs = parse_bedgraph(sorted_lines)
    non_overlap = eliminate_overlap(bgs, window_size, overlap, resolution)
    write_bedgraph(non_overlap, output)
    subprocess.check_call(['rm', output + ".tmp"])  # rm merged tmp file
def _aggregate(self, span):
    from cooler.api import Cooler

    lo, hi = span
    clr = Cooler(self.source_uri)
    # convert_enum=False returns chroms as raw ints
    table = clr.pixels(join=True, convert_enum=False)
    chunk = table[lo:hi]
    logger.info('{} {}'.format(lo, hi))

    # use the "start" point as anchor for re-binning
    # XXX - alternatives: midpoint anchor, proportional re-binning
    binsize = self.gs.binsize
    chrom_binoffset = self.gs.chrom_binoffset
    chrom_abspos = self.gs.chrom_abspos
    start_abspos = self.gs.start_abspos

    chrom_id1 = chunk['chrom1'].values
    chrom_id2 = chunk['chrom2'].values
    start1 = chunk['start1'].values
    start2 = chunk['start2'].values

    if binsize is None:
        abs_start1 = chrom_abspos[chrom_id1] + start1
        abs_start2 = chrom_abspos[chrom_id2] + start2
        chunk['bin1_id'] = np.searchsorted(
            start_abspos, abs_start1, side='right') - 1
        chunk['bin2_id'] = np.searchsorted(
            start_abspos, abs_start2, side='right') - 1
    else:
        rel_bin1 = np.floor(start1 / binsize).astype(int)
        rel_bin2 = np.floor(start2 / binsize).astype(int)
        chunk['bin1_id'] = chrom_binoffset[chrom_id1] + rel_bin1
        chunk['bin2_id'] = chrom_binoffset[chrom_id2] + rel_bin2

    grouped = chunk.groupby(['bin1_id', 'bin2_id'], sort=False)
    return grouped['count'].sum().reset_index()
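
# Illustrative sketch (not part of the original module): how the variable-bin
# branch of _aggregate assigns new bin ids with np.searchsorted. The arrays
# below are made up; `start_abspos` plays the role of self.gs.start_abspos.
def _rebin_example():
    import numpy as np

    # absolute start coordinates of four coarse bins: [0, 100), [100, 250), ...
    start_abspos = np.array([0, 100, 250, 400])
    # absolute start positions of three pixels to re-bin
    abs_start = np.array([10, 120, 399])
    # searchsorted(..., side='right') - 1 gives the index of the containing bin
    bin_ids = np.searchsorted(start_abspos, abs_start, side='right') - 1
    assert list(bin_ids) == [0, 1, 2]
    return bin_ids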
def stats_v4c(bed, cool_uri, h5group_uri, inner_window,
              up_stream, down_stream, balance, processes):
    """
    Compute the value matrix around the start positions in the BED file.

    \b
    Args
    ----
    bed : str
        Path to input bed file.
    cool_uri : str
        URI to cool.
    h5group_uri : str
        URI of output HDF5 file group, like:
            ./test.h5
            ./test.h5::/virtual4c/
    """
    path, group = split_uri(h5group_uri)
    if not os.path.exists(path):
        h5py.File(path, 'a').close()  # create the file if it does not exist
    df = read_bed_df(bed)
    global num_records
    num_records = df.shape[0]  # for creating the progress bar
    dataframe_to_hdf5(df, h5group_uri, "ref_bed")
    bed_recs = read_bed(bed)
    cool = Cooler(cool_uri)
    mat_sel = MatrixSelector(cool, balance=balance)

    def iterover_fetch_scores(records):
        chrs, ref_pos = tee(records)
        chrs = (rec[0] for rec in chrs)
        ref_pos = (rec[1] for rec in ref_pos)
        map_ = ProcessPoolExecutor(
            max_workers=processes).map if processes > 1 else map
        args = (repeat(mat_sel), chrs, ref_pos, repeat(inner_window),
                repeat(up_stream), repeat(down_stream))
        for scores in map_(count_range, *args):
            yield scores

    scores_iter = iterover_fetch_scores(bed_recs)
    incremental_chunk_size = 20
    scores_iter_to_hdf5(scores_iter, h5group_uri, "matrix", incremental_chunk_size)
    write_meta_info(h5group_uri, bed, cool_uri, inner_window, up_stream, down_stream)
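
# Read-back sketch (assumption: scores_iter_to_hdf5 above stores the scores as
# a dataset named "matrix" inside the output group; the file and group names
# are placeholders, not part of the original module):
def _load_v4c_matrix(h5group_uri="test.h5::/virtual4c"):
    import h5py

    path, group = h5group_uri.split("::")
    with h5py.File(path, "r") as f:
        # one row per BED record, one column per bin in the fetched window
        return f[group]["matrix"][:]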
def prepare_cool(url=COOL_URL):
    import os
    import re
    import wget
    from os.path import split
    from cooler.api import Cooler

    log.info(f"Download cool file from {url}")
    down_dir = mk_dir(DOWNLOAD_DIR)
    cool_file = split(url)[-1]
    cool_path = str(down_dir / cool_file)
    wget.download(url, cool_path)

    log.info("Zoomify cool")
    c = Cooler(cool_path)
    resos = [str(r) for r in RESOLUTIONS if r >= c.binsize]
    check_call(["cooler", "zoomify", "--balance", "-p", "30",
                "-r", ",".join(resos), cool_path])

    target = MCOOL
    if os.path.exists(target):
        os.unlink(target)
    mcool_path = re.sub(r"\.cool$", ".mcool", cool_path)
    os.symlink(mcool_path, target)
def create_from_unordered(cool_uri, bins, chunks, columns=None, dtype=None,
                          mergebuf=int(20e6), delete_temp=True, temp_dir=None,
                          multifile_merge=False, **kwargs):
    """
    Create a Cooler in two passes via an external sort mechanism. In the first
    pass, a sequence of data chunks are processed and sorted in memory and
    saved to temporary Coolers. In the second pass, the temporary Coolers are
    merged into the output. This way the individual chunks do not need to be
    provided in any particular order.

    Parameters
    ----------
    cool_uri : str
        Path to Cooler file or URI to Cooler group. If the file does not
        exist, it will be created.
    bins : DataFrame
        Segmentation of the chromosomes into genomic bins. May contain
        additional columns.
    chunks : iterable of DataFrames
        Sequence of chunks that get processed and written to separate Coolers
        and then subsequently merged.
    columns : sequence of str, optional
        Specify here the names of any additional value columns from the input
        besides 'count' to store in the Cooler. The standard columns
        ['bin1_id', 'bin2_id', 'count'] can be provided, but are already
        assumed and don't need to be given explicitly. Additional value
        columns provided here will be stored as np.float64 unless otherwise
        specified using `dtype`.
    dtype : dict, optional
        Dictionary mapping column names in the pixel table to dtypes. Can be
        used to override the default dtypes of 'bin1_id', 'bin2_id' or
        'count'. Any additional value column dtypes must also be provided in
        the `columns` argument, or they will be ignored.
    mergebuf : int, optional
        Maximum number of records to buffer in memory at any given time during
        the merge step.
    delete_temp : bool, optional
        Whether to delete temporary files when finished. Setting this to False
        is useful for debugging. Default is True.
    temp_dir : str, optional
        Create temporary files in this directory.
    metadata : dict, optional
        Experiment metadata to store in the file. Must be JSON compatible.
    assembly : str, optional
        Name of genome assembly.
    h5opts : dict, optional
        HDF5 dataset filter options to use (compression, shuffling,
        checksumming, etc.). Default is to use autochunking and GZIP
        compression, level 6.
    append : bool, optional
        Append new Cooler to the file if it exists. If False, an existing file
        with the same name will be truncated. Default is False.
    lock : multiprocessing.Lock, optional
        Optional lock to control concurrent access to the output file.

    See also
    --------
    sanitize_records
    sanitize_pixels

    """
    bins = bins.copy()
    bins['chrom'] = bins['chrom'].astype(object)

    if dtype is None and 'dtypes' in kwargs:
        dtype = kwargs.pop('dtypes')

    tf = tempfile.NamedTemporaryFile(
        suffix='.multi.cool', delete=delete_temp, dir=temp_dir)

    uris = []
    for i, chunk in enumerate(chunks):
        uri = tf.name + '::' + str(i)
        uris.append(uri)
        log.info('Writing chunk {}: {}'.format(i, uri))
        create(uri, bins, chunk, columns=columns, dtype=dtype, append=True,
               boundscheck=False, triucheck=False, dupcheck=False,
               ensure_sorted=False)

    chunks = CoolerMerger([Cooler(uri) for uri in uris], mergebuf)

    log.info('Merging into {}'.format(cool_uri))
    create(cool_uri, bins, chunks, columns=columns, dtype=dtype, **kwargs)
def balance(cool_uri, nproc=1, chunksize=int(1e7), mad_max=5, min_nnz=10,
            min_count=0, ignore_diags=1, tol=1e-5, max_iters=200):
    """
    Cooler contact matrix balancing.

    Parameters
    ----------
    cool_uri : str
        URI of cooler group.
    nproc : int
        Number of processes. (Default: 1)
    """
    cool_path, group_path = parse_cooler_uri(cool_uri)

    # pre-check the weight column; deleting requires write access
    with h5py.File(cool_path, 'r+') as h5:
        grp = h5[group_path]
        if 'weight' in grp['bins']:
            del grp['bins']['weight']  # Overwrite the weight column

    log.info('Balancing {0}'.format(cool_uri))
    clr = Cooler(cool_uri)

    try:
        if nproc > 1:
            pool = Pool(nproc)
            map_ = pool.imap_unordered
        else:
            map_ = map

        onlyIntra = clr.info['metadata']['onlyIntra'] == 'True'

        bias, stats = ice.iterative_correction(
            clr,
            chunksize=chunksize,
            cis_only=onlyIntra,
            trans_only=False,
            tol=tol,
            min_nnz=min_nnz,
            min_count=min_count,
            blacklist=None,
            mad_max=mad_max,
            max_iters=max_iters,
            ignore_diags=ignore_diags,
            rescale_marginals=True,
            use_lock=False,
            map=map_)
    finally:
        if nproc > 1:
            pool.close()

    if not stats['converged']:
        log.error('Iteration limit reached without convergence')
        log.error('Storing final result. Check log to assess convergence.')

    with h5py.File(cool_path, 'r+') as h5:
        grp = h5[group_path]
        # add the bias column to the file
        h5opts = dict(compression='gzip', compression_opts=6)
        grp['bins'].create_dataset('weight', data=bias, **h5opts)
        grp['bins']['weight'].attrs.update(stats)
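
# Usage sketch (hypothetical file name and resolution group; requires an
# existing, unbalanced cooler at that URI):
def _balance_example():
    # writes the bias vector to bins/weight of the 10 kb group and attaches
    # the convergence stats as HDF5 attributes of that dataset
    balance("example.mcool::resolutions/10000", nproc=4, mad_max=5, tol=1e-5)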
def create_from_unordered(cool_uri, bins, chunks, columns=None, dtypes=None,
                          mergebuf=int(20e6), delete_temp=True, temp_dir=None,
                          **kwargs):
    """
    Create a Cooler in two passes via an external sort mechanism. In the first
    pass, a sequence of data chunks are processed and sorted in memory and
    saved to temporary Coolers. In the second pass, the temporary Coolers are
    merged into the output. This way the individual chunks do not need to be
    provided in any particular order.

    Parameters
    ----------
    cool_uri : str
        Path to Cooler file or URI to Cooler group. If the file does not
        exist, it will be created.
    bins : DataFrame
        Segmentation of the chromosomes into genomic bins. May contain
        additional columns.
    chunks : iterable of DataFrames
        Sequence of chunks that get processed and written to separate Coolers
        and then subsequently merged.
    columns : sequence of str, optional
        Specify here the names of any additional value columns from the input
        besides 'count' to store in the Cooler. The standard columns
        ['bin1_id', 'bin2_id', 'count'] can be provided, but are already
        assumed and don't need to be given explicitly. Additional value
        columns provided here will be stored as np.float64 unless otherwise
        specified using ``dtypes``.
    dtypes : dict, optional
        Dictionary mapping column names to dtypes. Can be used to override the
        default dtypes of ``bin1_id``, ``bin2_id`` or ``count`` or assign
        dtypes to custom value columns. Non-standard value columns given in
        ``dtypes`` must also be provided in the ``columns`` argument or they
        will be ignored.
    assembly : str, optional
        Name of genome assembly.
    mode : {'w', 'a'}, optional [default: 'w']
        Write mode for the output file. 'a': if the output file exists, append
        the new cooler to it. 'w': if the output file exists, it will be
        truncated. Default is 'w'.
    metadata : dict, optional
        Experiment metadata to store in the file. Must be JSON compatible.
    mergebuf : int, optional
        Maximum number of records to buffer in memory at any given time during
        the merge step.
    delete_temp : bool, optional
        Whether to delete temporary files when finished. Setting this to False
        is useful for debugging. Default is True.
    temp_dir : str, optional
        Create temporary files in this directory.

    See also
    --------
    sanitize_records
    sanitize_pixels

    """
    bins = bins.copy()
    bins['chrom'] = bins['chrom'].astype(object)

    tf = tempfile.NamedTemporaryFile(
        suffix='.multi.cool', delete=delete_temp, dir=temp_dir)

    uris = []
    for i, chunk in enumerate(chunks):
        uri = tf.name + '::' + str(i)
        uris.append(uri)
        log.info('Writing chunk {}: {}'.format(i, uri))
        create_cooler(uri, bins, chunk, columns=columns, mode='a',
                      boundscheck=False, triucheck=False, dupcheck=False,
                      ensure_sorted=False, ordered=True, dtypes=dtypes)

    chunks = CoolerMerger([Cooler(uri) for uri in uris], mergebuf)

    log.info('Merging into {}'.format(cool_uri))
    create_cooler(cool_uri, bins, chunks, columns=columns, dtypes=dtypes,
                  ordered=True, **kwargs)
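
# Minimal sketch (toy file name and values; assumes pandas is available): build
# two pixel chunks against a 3-bin segmentation and merge them with the
# two-pass mechanism above. Only the order of the chunks is arbitrary; each
# chunk itself is kept sorted.
def _create_from_unordered_example():
    import pandas as pd

    bins = pd.DataFrame({
        "chrom": ["chr1", "chr1", "chr1"],
        "start": [0, 1000, 2000],
        "end": [1000, 2000, 3000],
    })
    chunks = [
        pd.DataFrame({"bin1_id": [0, 1], "bin2_id": [0, 2], "count": [5, 3]}),
        pd.DataFrame({"bin1_id": [0], "bin2_id": [1], "count": [2]}),
    ]
    create_from_unordered("toy.cool", bins, chunks, mergebuf=int(1e6))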