Example #1
def diff_ent(cool_uri_1, cool_uri_2, bed_path, output, inner_window, balance,
             gap_ratio, processes, chunk_size):
    """
    \b
    Args
    ----
    cool_uri_1 : str
        URI of input cool container 1.
    cool_uri_2 : str
        URI of input cool container 2.
    bed_path : str
        Path to input BED.
    output : str
        Path to output BEDGRAPH file.
    """
    c1 = Cooler(cool_uri_1)
    matrix_selector_1 = MatrixSelector(c1, balance=balance)
    c2 = Cooler(cool_uri_2)
    matrix_selector_2 = MatrixSelector(c2, balance=balance)
    
    regions = read_bed(bed_path)
    chunks = chunking(regions, chunk_size)
    if os.path.exists(output):
        subprocess.check_call(['rm', output])
    with ProcessPoolExecutor(max_workers=processes) as executor:
        map_ = map if processes == 1 else executor.map
        for out_chunk in map_(process_region_chunk_diff_ent, chunks,
                              repeat(matrix_selector_1),
                              repeat(matrix_selector_2), repeat(gap_ratio)):
            write_result(out_chunk, output, mode='a')
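
The `map if processes == 1 else executor.map` dispatch used above is a generic fan-out idiom; a minimal, self-contained sketch with a hypothetical worker in place of process_region_chunk_diff_ent:

from concurrent.futures import ProcessPoolExecutor
from itertools import repeat

def worker(chunk, scale):  # hypothetical stand-in for the real chunk processor
    return [x * scale for x in chunk]

if __name__ == '__main__':
    chunks = [[1, 2], [3, 4], [5, 6]]
    processes = 2
    with ProcessPoolExecutor(max_workers=processes) as executor:
        map_ = map if processes == 1 else executor.map
        for out_chunk in map_(worker, chunks, repeat(10)):
            print(out_chunk)  # constant arguments are broadcast via repeat()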
Example #2
    def __init__(self, source_uri, bins, chunksize, batchsize, map=map):
        from cooler.api import Cooler
        self._map = map
        self.source_uri = source_uri
        self.chunksize = chunksize
        self.batchsize = batchsize

        clr = Cooler(source_uri)
        self._size = clr.info['nnz']
        self.old_binsize = clr.binsize
        self.old_chrom_offset = clr._load_dset('indexes/chrom_offset')
        self.old_bin1_offset = clr._load_dset('indexes/bin1_offset')
        self.gs = GenomeSegmentation(clr.chromsizes, bins)
        self.new_binsize = get_binsize(bins)
        assert self.new_binsize % self.old_binsize == 0, \
            'new bin size must be a multiple of the old bin size'
        self.factor = self.new_binsize // self.old_binsize
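
A toy illustration of the divisibility check at the end of `__init__`: coarsening is only well defined when the new bin size is an integer multiple of the old one.

old_binsize = 10_000   # e.g. a 10 kb source cooler
new_binsize = 50_000   # target resolution
assert new_binsize % old_binsize == 0
factor = new_binsize // old_binsize  # each new bin aggregates 5 old bins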
Example #3
def region(cool_uri, bed_path, output, inner_window, balance, coverage,
           processes, chunk_size):
    """
    \b
    Args
    ----
    cool_uri : str
        URI of input cool container.
    bed_path : str
        Path to input BED.
    output : str
        Path to output BEDGRAPH file.
    """
    c = Cooler(cool_uri)
    matrix_selector = MatrixSelector(c, balance=balance)
    regions = read_bed(bed_path)
    chunks = chunking(regions, chunk_size)
    if os.path.exists(output):
        subprocess.check_call(['rm', output])
    with ProcessPoolExecutor(max_workers=processes) as executor:
        map_ = map if processes == 1 else executor.map
        for out_chunk in map_(process_region_chunk, chunks,
                              repeat(matrix_selector), repeat(coverage)):
            bgs = filter_abnormal(out_chunk)
            write_bedgraph(bgs, output, mode='a')
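
`read_bed` is not shown in these examples; a minimal sketch of what such a helper might look like, assuming a whitespace-delimited BED file with at least three columns:

def read_bed(path):
    """Yield (chrom, start, end) tuples from a BED file (hypothetical sketch)."""
    with open(path) as f:
        for line in f:
            if not line.strip() or line.startswith(('#', 'track', 'browser')):
                continue
            chrom, start, end = line.split()[:3]
            yield chrom, int(start), int(end)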
Example #4
    def _aggregate(self, span):
        from cooler.api import Cooler
        lo, hi = span
        logger.info('{} {}'.format(lo, hi))

        lock.acquire()
        try:
            with h5py.File(self.cooler_path, 'r') as h5:
                c = Cooler(h5[self.cooler_root])
                # convert_enum=False should return chroms as int
                table = c.pixels(join=True, convert_enum=False)
                chunk = table[lo:hi]
        finally:
            lock.release()

        # use the "start" point as anchor for re-binning
        # XXX - alternatives: midpoint anchor, proportional re-binning
        binsize = self.gs.binsize
        chrom_binoffset = self.gs.chrom_binoffset
        chrom_abspos = self.gs.chrom_abspos
        start_abspos = self.gs.start_abspos

        chrom_id1 = chunk['chrom1'].values
        chrom_id2 = chunk['chrom2'].values
        start1 = chunk['start1'].values
        start2 = chunk['start2'].values
        if binsize is None:
            abs_start1 = chrom_abspos[chrom_id1] + start1
            abs_start2 = chrom_abspos[chrom_id2] + start2
            chunk['bin1_id'] = np.searchsorted(
                start_abspos, abs_start1, side='right') - 1
            chunk['bin2_id'] = np.searchsorted(
                start_abspos, abs_start2, side='right') - 1
        else:
            rel_bin1 = np.floor(start1 / binsize).astype(int)
            rel_bin2 = np.floor(start2 / binsize).astype(int)
            chunk['bin1_id'] = chrom_binoffset[chrom_id1] + rel_bin1
            chunk['bin2_id'] = chrom_binoffset[chrom_id2] + rel_bin2

        grouped = chunk.groupby(['bin1_id', 'bin2_id'], sort=False)
        return grouped['count'].sum().reset_index()
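
A worked example of the variable-binsize branch above: with side='right', searchsorted returns the insertion point after any equal element, so subtracting 1 yields the index of the bin whose start is at or before the query coordinate.

import numpy as np

start_abspos = np.array([0, 100, 250, 400])    # hypothetical absolute bin starts
abs_start = np.array([0, 99, 100, 399, 400])   # query coordinates
bin_id = np.searchsorted(start_abspos, abs_start, side='right') - 1
print(bin_id)  # [0 0 1 2 3]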
Example #5
def loci(cool_uri, output, window_size, overlap, inner_window, outer_window,
         balance, subtract_expect, coverage, processes, chunk_size):
    """

    \b
                          inner window
                               |-|
    |--------------------------------------------------------|
              |-                                 -|
                          outer window
    \b
    Args
    ----
    cool_uri : str
        URI of input cool container.
    output : str
        Path to output BEDGRAPH file.
    """
    c = Cooler(cool_uri)
    resolution = c.info['bin-size']
    chromsizes = c.chromsizes.to_dict()
    chromsizes = {
        chr_: (size // resolution) + 1
        for chr_, size in chromsizes.items()
    }

    chr_chunks = chromosome_chunks(chromsizes, window_size, overlap,
                                   chunk_size)
    it1, it2 = tee(chr_chunks)  # split chr_chunks to chroms and chunks
    chroms = map(operator.itemgetter(0), it1)
    chunks = map(operator.itemgetter(1), it2)

    matrix_selector = MatrixSelector(c, balance=balance)

    if os.path.exists(output):
        subprocess.check_call(['rm', output])

    with ProcessPoolExecutor(max_workers=processes) as executor:
        map_ = map if processes == 1 else executor.map
        tmp_file = output + ".tmp"
        idx = 0
        args = (repeat(matrix_selector), chroms, chunks,
                repeat(inner_window), repeat(outer_window),
                repeat(subtract_expect), repeat(coverage))
        for out_chunk in map_(process_loci_chunk, *args):
            print("chunk {}: {}/{} blocks".format(idx, len(out_chunk),
                                                  chunk_size))
            write_bedgraph(out_chunk, tmp_file, mode='a')
            idx += 1

    sorted_lines = sort_bedGraph(tmp_file)
    bgs = parse_bedgraph(sorted_lines)
    non_overlap = eliminate_overlap(bgs, window_size, overlap, resolution)
    write_bedgraph(non_overlap, output)
    subprocess.check_call(['rm', tmp_file])  # remove the merged tmp file
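
The `tee`/`itemgetter` idiom above splits one iterator of pairs into two parallel iterators without materialising the whole sequence; a minimal illustration:

import operator
from itertools import tee

pairs = iter([('chr1', [0, 1]), ('chr2', [2, 3])])
it1, it2 = tee(pairs)
chroms = map(operator.itemgetter(0), it1)
chunks = map(operator.itemgetter(1), it2)
print(list(chroms), list(chunks))  # ['chr1', 'chr2'] [[0, 1], [2, 3]]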
Example #6
    def _aggregate(self, span):
        from cooler.api import Cooler
        lo, hi = span

        clr = Cooler(self.source_uri)
        # convert_enum=False returns chroms as raw ints
        table = clr.pixels(join=True, convert_enum=False)
        chunk = table[lo:hi]
        logger.info('{} {}'.format(lo, hi))

        # use the "start" point as anchor for re-binning
        # XXX - alternatives: midpoint anchor, proportional re-binning
        binsize = self.gs.binsize
        chrom_binoffset = self.gs.chrom_binoffset
        chrom_abspos = self.gs.chrom_abspos
        start_abspos = self.gs.start_abspos

        chrom_id1 = chunk['chrom1'].values
        chrom_id2 = chunk['chrom2'].values
        start1 = chunk['start1'].values
        start2 = chunk['start2'].values
        if binsize is None:
            abs_start1 = chrom_abspos[chrom_id1] + start1
            abs_start2 = chrom_abspos[chrom_id2] + start2
            chunk['bin1_id'] = np.searchsorted(
                start_abspos, abs_start1, side='right') - 1
            chunk['bin2_id'] = np.searchsorted(
                start_abspos, abs_start2, side='right') - 1
        else:
            rel_bin1 = np.floor(start1 / binsize).astype(int)
            rel_bin2 = np.floor(start2 / binsize).astype(int)
            chunk['bin1_id'] = chrom_binoffset[chrom_id1] + rel_bin1
            chunk['bin2_id'] = chrom_binoffset[chrom_id2] + rel_bin2

        grouped = chunk.groupby(['bin1_id', 'bin2_id'], sort=False)
        return grouped['count'].sum().reset_index()
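
A worked example of the fixed-binsize branch: the new bin id is the chromosome's global bin offset plus the pixel's start coordinate integer-divided by the bin size (the values below are hypothetical).

import numpy as np

binsize = 10_000
chrom_binoffset = np.array([0, 300])  # chromosome 1 starts at global bin 300
chrom_id = np.array([1])
start = np.array([25_000])
bin_id = chrom_binoffset[chrom_id] + start // binsize
print(bin_id)  # [302]: bin 2 within the chromosome, offset by 300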
Example #7
def stats_v4c(bed, cool_uri, h5group_uri, inner_window, up_stream, down_stream,
              balance, processes):
    """
    Compute the value matrix in bigwig around start position in bed file.

    \b
    Args
    ----
    bed : str
        Path to input bed file.
    cool_uri : str
        URI to cool.
    h5group_uri : str
        URI of output HDF5 file group, like:
            ./test.h5
            ./test.h5::/virtual4c/
    """
    path, group = split_uri(h5group_uri)
    if not os.path.exists(path):
        h5py.File(path, 'a').close()  # create the file if it does not exist
    df = read_bed_df(bed)
    global num_records
    num_records = df.shape[0]  # used to size the progress bar
    dataframe_to_hdf5(df, h5group_uri, "ref_bed")
    bed_recs = read_bed(bed)
    cool = Cooler(cool_uri)
    mat_sel = MatrixSelector(cool, balance=balance)

    def iterover_fetch_scores(records):
        chrs, ref_pos = tee(records)
        chrs = (rec[0] for rec in chrs)
        ref_pos = (rec[1] for rec in ref_pos)
        map_ = ProcessPoolExecutor(
            max_workers=processes).map if processes > 1 else map
        args = (repeat(mat_sel), chrs, ref_pos, repeat(inner_window),
                repeat(up_stream), repeat(down_stream))
        for scores in map_(count_range, *args):
            yield scores

    scores_iter = iterover_fetch_scores(bed_recs)
    incremental_chunk_size = 20
    scores_iter_to_hdf5(scores_iter, h5group_uri, "matrix",
                        incremental_chunk_size)
    write_meta_info(h5group_uri, bed, cool_uri, inner_window, up_stream,
                    down_stream)
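
`scores_iter_to_hdf5` is not shown here; one plausible implementation, sketched under the assumption that every chunk is a 2-D array with a fixed column count, appends into a resizable HDF5 dataset:

import h5py
import numpy as np

def scores_iter_to_hdf5_sketch(chunks, path, name):
    """Append an iterator of equally wide 2-D arrays to a growable dataset."""
    with h5py.File(path, 'a') as f:
        dset = None
        for chunk in chunks:
            chunk = np.asarray(chunk)
            if dset is None:
                dset = f.create_dataset(name, shape=(0, chunk.shape[1]),
                                        maxshape=(None, chunk.shape[1]),
                                        dtype=chunk.dtype)
            n = dset.shape[0]
            dset.resize(n + chunk.shape[0], axis=0)
            dset[n:] = chunk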
Example #8
def prepare_cool(url=COOL_URL):
    import os
    import re
    import wget
    from os.path import split
    from subprocess import check_call
    from cooler.api import Cooler
    log.info(f"download cool file from {url}")
    down_dir = mk_dir(DOWNLOAD_DIR)
    cool_file = split(url)[-1]
    cool_path = str(down_dir / cool_file)
    wget.download(url, cool_path)

    log.info("Zoomify cool")
    c = Cooler(cool_path)
    resos = [str(r) for r in RESOLUTIONS if r >= c.binsize]
    check_call(["cooler", "zoomify", "--balance", "-p", "30", "-r", ",".join(resos), cool_path])
    target = MCOOL
    if os.path.exists(target):
        os.unlink(target)
    mcool_path = re.sub(".cool$", ".mcool", cool_path)
    os.symlink(mcool_path, target)
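
After `cooler zoomify`, each resolution lives in its own group inside the .mcool file and can be opened with cooler's `::` URI syntax; a brief sketch (path and resolution are illustrative):

from cooler.api import Cooler

clr = Cooler('downloads/test.mcool::/resolutions/10000')
print(clr.binsize, clr.chromnames[:3])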
Example #9
def create_from_unordered(cool_uri,
                          bins,
                          chunks,
                          columns=None,
                          dtype=None,
                          mergebuf=int(20e6),
                          delete_temp=True,
                          temp_dir=None,
                          multifile_merge=False,
                          **kwargs):
    """
    Create a Cooler in two passes via an external sort mechanism. In the first
    pass, a sequence of data chunks is processed, sorted in memory, and saved
    to temporary Coolers. In the second pass, the temporary Coolers are merged
    into the output. This way the individual chunks do not need to be provided
    in any particular order.
    
    Parameters
    ----------
    cool_uri : str
        Path to Cooler file or URI to Cooler group. If the file does not exist,
        it will be created.
    bins : DataFrame
        Segmentation of the chromosomes into genomic bins. May contain 
        additional columns.
    chunks : iterable of DataFrames
        Sequence of chunks that get processed and written to separate Coolers 
        and then subsequently merged.
    columns : sequence of str, optional
        Specify here the names of any additional value columns from the input 
        besides 'count' to store in the Cooler. The standard columns ['bin1_id', 
        'bin2_id', 'count'] can be provided, but are already assumed and don't 
        need to be given explicitly. Additional value columns provided here will 
        be stored as np.float64 unless otherwise specified using `dtype`.
    dtype : dict, optional
        Dictionary mapping column names in the pixel table to dtypes. Can be 
        used to override the default dtypes of 'bin1_id', 'bin2_id' or 'count'. 
        Any additional value column dtypes must also be provided in the
        `columns` argument, or will be ignored.
    mergebuf : int, optional
        Maximum number of records to buffer in memory at any given time during
        the merge step.
    delete_temp : bool, optional
        Whether to delete temporary files when finished. Set to False to keep
        them for debugging. Default is True.
    temp_dir : str, optional
        Create temporary files in this directory.
    metadata : dict, optional
        Experiment metadata to store in the file. Must be JSON compatible.
    assembly : str, optional
        Name of genome assembly.
    h5opts : dict, optional
        HDF5 dataset filter options to use (compression, shuffling,
        checksumming, etc.). Default is to use autochunking and GZIP
        compression, level 6.
    append : bool, optional
        Append new Cooler to the file if it exists. If False, an existing file
        with the same name will be truncated. Default is False.
    lock : multiprocessing.Lock, optional
        Optional lock to control concurrent access to the output file.

    See also
    --------
    sanitize_records
    sanitize_pixels

    """
    bins = bins.copy()
    bins['chrom'] = bins['chrom'].astype(object)

    if dtype is None and 'dtypes' in kwargs:
        dtype = kwargs.pop('dtypes')

    tf = tempfile.NamedTemporaryFile(suffix='.multi.cool',
                                     delete=delete_temp,
                                     dir=temp_dir)

    uris = []
    for i, chunk in enumerate(chunks):
        uri = tf.name + '::' + str(i)
        uris.append(uri)
        log.info('Writing chunk {}: {}'.format(i, uri))
        create(uri,
               bins,
               chunk,
               columns=columns,
               dtype=dtype,
               append=True,
               boundscheck=False,
               triucheck=False,
               dupcheck=False,
               ensure_sorted=False)

    chunks = CoolerMerger([Cooler(uri) for uri in uris], mergebuf)

    log.info('Merging into {}'.format(cool_uri))
    create(cool_uri, bins, chunks, columns=columns, dtype=dtype, **kwargs)
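
A hedged usage sketch: two out-of-order pixel chunks merged into a single Cooler. The toy bins and pixels below are illustrative only.

import pandas as pd

bins = pd.DataFrame({
    'chrom': ['chr1'] * 4,
    'start': [0, 10, 20, 30],
    'end':   [10, 20, 30, 40],
})
chunks = iter([
    pd.DataFrame({'bin1_id': [2], 'bin2_id': [3], 'count': [1]}),
    pd.DataFrame({'bin1_id': [0], 'bin2_id': [1], 'count': [5]}),
])
create_from_unordered('out.cool', bins, chunks)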
Example #10
def balance(cool_uri,
            nproc=1,
            chunksize=int(1e7),
            mad_max=5,
            min_nnz=10,
            min_count=0,
            ignore_diags=1,
            tol=1e-5,
            max_iters=200):
    """
    Cooler contact matrix balancing.
    
    Parameters
    ----------
    cool_uri : str
        URI of cooler group.
    nproc : int
        Number of processes. (Default: 1)
        
    """
    cool_path, group_path = parse_cooler_uri(cool_uri)
    # pre-check the weight column
    with h5py.File(cool_path, 'r+') as h5:
        grp = h5[group_path]
        if 'weight' in grp['bins']:
            del grp['bins']['weight']  # Overwrite the weight column

    log.info('Balancing {0}'.format(cool_uri))

    clr = Cooler(cool_uri)

    try:
        if nproc > 1:
            pool = Pool(nproc)
            map_ = pool.imap_unordered
        else:
            map_ = map

        onlyIntra = clr.info['metadata']['onlyIntra'] == 'True'

        bias, stats = ice.iterative_correction(clr,
                                               chunksize=chunksize,
                                               cis_only=onlyIntra,
                                               trans_only=False,
                                               tol=tol,
                                               min_nnz=min_nnz,
                                               min_count=min_count,
                                               blacklist=None,
                                               mad_max=mad_max,
                                               max_iters=max_iters,
                                               ignore_diags=ignore_diags,
                                               rescale_marginals=True,
                                               use_lock=False,
                                               map=map_)
    finally:
        if nproc > 1:
            pool.close()

    if not stats['converged']:
        log.error('Iteration limit reached without convergence')
        log.error('Storing final result. Check log to assess convergence.')

    with h5py.File(cool_path, 'r+') as h5:
        grp = h5[group_path]
        # add the bias column to the file
        h5opts = dict(compression='gzip', compression_opts=6)
        grp['bins'].create_dataset('weight', data=bias, **h5opts)
        grp['bins']['weight'].attrs.update(stats)
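
Once balancing has run, the stored bias vector can be read back and applied through the standard cooler API; a brief sketch (the URI is illustrative):

clr = Cooler('test.cool')
weights = clr.bins()[:]['weight']             # per-bin bias values written above
mat = clr.matrix(balance=True).fetch('chr1')  # balanced contact matrix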
Example #11
def create_from_unordered(cool_uri, bins, chunks, columns=None, dtypes=None,
                          mergebuf=int(20e6), delete_temp=True, temp_dir=None,
                          **kwargs):
    """
    Create a Cooler in two passes via an external sort mechanism. In the first
    pass, a sequence of data chunks is processed, sorted in memory, and saved
    to temporary Coolers. In the second pass, the temporary Coolers are merged
    into the output. This way the individual chunks do not need to be provided
    in any particular order.
    
    Parameters
    ----------
    cool_uri : str
        Path to Cooler file or URI to Cooler group. If the file does not exist,
        it will be created.
    bins : DataFrame
        Segmentation of the chromosomes into genomic bins. May contain 
        additional columns.
    chunks : iterable of DataFrames
        Sequence of chunks that get processed and written to separate Coolers 
        and then subsequently merged.
    columns : sequence of str, optional
        Specify here the names of any additional value columns from the input 
        besides 'count' to store in the Cooler. The standard columns ['bin1_id', 
        'bin2_id', 'count'] can be provided, but are already assumed and don't 
        need to be given explicitly. Additional value columns provided here will 
        be stored as np.float64 unless otherwise specified using ``dtypes``.
    dtypes : dict, optional
        Dictionary mapping column names to dtypes. Can be used to override the
        default dtypes of ``bin1_id``, ``bin2_id`` or ``count`` or assign
        dtypes to custom value columns. Non-standard value columns given in
        ``dtypes`` must also be provided in the ``columns`` argument or they
        will be ignored.
    assembly : str, optional
        Name of genome assembly.
    mode : {'w' , 'a'}, optional [default: 'w']
        Write mode for the output file. 'a': if the output file exists, append
        the new cooler to it. 'w': if the output file exists, it will be
        truncated. Default is 'w'.
    metadata : dict, optional
        Experiment metadata to store in the file. Must be JSON compatible.
    mergebuf : int, optional
        Maximum number of records to buffer in memory at any given time during
        the merge step.
    delete_temp : bool, optional
        Whether to delete temporary files when finished. Set to False to keep
        them for debugging. Default is True.
    temp_dir : str, optional
        Create temporary files in this directory.

    See also
    --------
    sanitize_records
    sanitize_pixels

    """
    bins = bins.copy()
    bins['chrom'] = bins['chrom'].astype(object)

    tf = tempfile.NamedTemporaryFile(
                suffix='.multi.cool', 
                delete=delete_temp,
                dir=temp_dir)
        
    uris = []
    for i, chunk in enumerate(chunks):
        uri = tf.name + '::' + str(i)
        uris.append(uri)
        log.info('Writing chunk {}: {}'.format(i, uri))
        create_cooler(uri, bins, chunk, columns=columns, mode='a', boundscheck=False,
                      triucheck=False, dupcheck=False, ensure_sorted=False, ordered=True,
                      dtypes=dtypes)
        
    chunks = CoolerMerger([Cooler(uri) for uri in uris], mergebuf)

    log.info('Merging into {}'.format(cool_uri))
    create_cooler(cool_uri, bins, chunks, columns=columns, dtypes=dtypes, ordered=True,
                  **kwargs)
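
A small sketch of the `columns`/`dtypes` interplay described in the docstring: an extra value column is stored only if it is named in `columns`, and its dtype can be pinned via `dtypes` (the names below are illustrative).

import numpy as np
import pandas as pd

bins = pd.DataFrame({'chrom': ['chr1'] * 2, 'start': [0, 10], 'end': [10, 20]})
chunk = pd.DataFrame({'bin1_id': [0], 'bin2_id': [1],
                      'count': [3], 'balanced': [0.5]})
create_from_unordered('out2.cool', bins, iter([chunk]),
                      columns=['balanced'], dtypes={'balanced': np.float32})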