Example #1
def copy_vnc_subvolume(box_zyx,
                       copy_grayscale=True,
                       copy_segmentation=True,
                       chunk_shape=(64, 64, 2048)):
    assert not (box_zyx % 64).any(), \
        "Only 64px block-aligned volumes can be copied."

    import numpy as np
    from neuclease.util import boxes_from_grid, tqdm_proxy, round_box
    from neuclease.dvid import find_master, fetch_raw, post_raw, fetch_subvol, post_labelmap_voxels

    vnc_master = ('emdata4:8200', find_master('emdata4:8200'))

    NUM_SCALES = 8
    num_voxels = np.prod(box_zyx[1] - box_zyx[0])

    if copy_grayscale:
        logger.info(
            f"Copying grayscale from box {box_zyx[:,::-1].tolist()} ({num_voxels/1e6:.1f} Mvox) for {NUM_SCALES} scales"
        )
        for scale in tqdm_proxy(range(NUM_SCALES)):
            if scale == 0:
                input_name = 'grayscalejpeg'
                output_name = 'local-grayscalejpeg'
            else:
                input_name = f'grayscalejpeg_{scale}'
                output_name = f'local-grayscalejpeg_{scale}'

            scaled_box_zyx = np.maximum(box_zyx // 2**scale, 1)
            scaled_box_zyx = round_box(scaled_box_zyx, 64, 'out')

            for chunk_box in tqdm_proxy(boxes_from_grid(scaled_box_zyx,
                                                        chunk_shape,
                                                        clipped=True),
                                        leave=False):
                chunk = fetch_subvol(*vnc_master,
                                     input_name,
                                     chunk_box,
                                     progress=False)
                post_raw(*vnc_master, output_name, chunk_box[0], chunk)

    if copy_segmentation:
        logger.info(
            f"Copying segmentation from box {box_zyx[:,::-1].tolist()} ({num_voxels/1e6:.2f} Mvox)"
        )
        for chunk_box in tqdm_proxy(
                boxes_from_grid(box_zyx, chunk_shape, clipped=True)):
            chunk = fetch_raw(*vnc_master,
                              'segmentation',
                              chunk_box,
                              dtype=np.uint64)
            post_labelmap_voxels(*vnc_master,
                                 'local-segmentation',
                                 chunk_box[0],
                                 chunk,
                                 downres=True)

        # TODO: Update label indexes?

    logger.info("DONE")
Example #2
def main():
    # Hard-coded parameters
    prod = 'emdata4:8900'
    master = (prod, find_master(prod))
    master_seg = (*master, 'segmentation')

    # I accidentally corrupted the labelindex of bodies in this region
    patch_box = 20480 + np.array([[0, 0, 0], [1024, 1024, 1024]])

    with Timer("Fetching supervoxels", logger):
        boxes = boxes_from_grid(patch_box, Grid((64, 64, 6400)), clipped=True)
        sv_sets = compute_parallel(partial(_fetch_svs, master_seg),
                                   boxes,
                                   processes=32,
                                   ordered=False,
                                   leave_progress=True)
        svs = set(chain(*sv_sets)) - set([0])

    bodies = set(fetch_mapping(*master_seg, svs))

    with Timer(f"Repairing {len(bodies)} labelindexes", logger):
        compute_parallel(partial(_repair_index, master_seg),
                         bodies,
                         processes=32,
                         ordered=False,
                         leave_progress=True)

    print("DONE.")
Example #3
def main():
    # Create the destination instance if necessary.
    dst_instances = fetch_repo_instances(*dst_node, 'annotation')
    if dst_syn not in dst_instances:
        logger.info(f"Creating instance '{dst_syn}'")
        create_instance(*dst_node, dst_syn, 'annotation')

    # Check to see if the sync already exists; add it if necessary
    syn_info = fetch_instance_info(*dst_node, dst_syn)
    if len(syn_info["Base"]["Syncs"]) == 0:
        logger.info(f"Adding a sync to '{dst_syn}' from '{dst_seg}'")
        post_sync(*dst_node, dst_syn, [dst_seg])
    elif syn_info["Base"]["Syncs"][0] != dst_seg:
        other_seg = syn_info["Base"]["Syncs"][0]
        raise RuntimeError(
            f"Can't create a sync to '{dst_seg}'. "
            f"Your instance is already sync'd to a different segmentation: {other_seg}"
        )

    # Fetch segmentation extents
    bounding_box_zyx = fetch_volume_box(*src_node, src_seg).tolist()

    # Break into block-aligned chunks (boxes) that are long in the X direction
    # (optimal access pattern for dvid read/write)
    boxes = boxes_from_grid(bounding_box_zyx, (256, 256, 6400), clipped=True)

    # Use a process pool to copy the chunks in parallel.
    compute_parallel(copy_syn_blocks,
                     boxes,
                     processes=PROCESSES,
                     ordered=False)
Example #4
def test_boxes_from_grid_0():
    # Simple: bounding_box starts at zero, no offset
    grid = Grid( (10,20), (0,0) )
    bounding_box = [(0,0), (100,300)]
    boxes = np.array(list(boxes_from_grid(bounding_box, grid)))
    assert boxes.shape == (np.prod( np.array(bounding_box[1]) / grid.block_shape ), 2, 2)
    assert (boxes % grid.block_shape == 0).all()
    assert (boxes[:, 1, :] - boxes[:, 0, :] == grid.block_shape).all()
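The properties this test asserts, shown concretely (same Grid/boxes_from_grid API as above):

import numpy as np
from neuclease.util import Grid, boxes_from_grid

grid = Grid((10, 20), (0, 0))
boxes = np.array(list(boxes_from_grid([(0, 0), (100, 300)], grid)))

print(boxes.shape)        # (150, 2, 2): 10x15 grid squares, each a (start, stop) pair
print(boxes[0].tolist())  # [[0, 0], [10, 20]], grid-aligned and exactly one block in size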
Example #5
    def init_boxes(self, volume_service, subset_labels, roi):
        sbm = None
        if roi:
            base_service = volume_service.base_service
            assert isinstance(base_service, DvidVolumeService), \
                "Can't specify an ROI unless you're using a dvid input"

            assert isinstance(volume_service, (ScaledVolumeService, DvidVolumeService)), \
                "The 'roi' option doesn't support adapters other than 'rescale-level'"
            scale = 0
            if isinstance(volume_service, ScaledVolumeService):
                scale = volume_service.scale_delta
                assert scale <= 5, \
                    "The 'roi' option doesn't support volumes downscaled beyond level 5"

            server, uuid, _seg_instance = base_service.instance_triple

            brick_shape = volume_service.preferred_message_shape
            assert not (brick_shape % 2**(5-scale)).any(), \
                "If using an ROI, select a brick shape that is divisible by 32"

            seg_box = volume_service.bounding_box_zyx
            seg_box = round_box(seg_box, brick_shape)
            seg_box_s0 = seg_box * 2**scale
            seg_box_s5 = seg_box // 2**(5 - scale)

            with Timer(
                    f"Fetching mask for ROI '{roi}' ({seg_box_s0[:, ::-1].tolist()})",
                    logger):
                roi_mask_s5, _ = fetch_roi(server,
                                           uuid,
                                           roi,
                                           format='mask',
                                           mask_box=seg_box_s5)

            # SBM 'full-res' corresponds to the input service voxels, not necessarily scale-0.
            sbm = SparseBlockMask.create_from_highres_mask(
                roi_mask_s5, 2**(5 - scale), seg_box, brick_shape)
        elif subset_labels:
            try:
                sbm = volume_service.sparse_block_mask_for_labels(
                    [*subset_labels])
                if ((sbm.box[1] - sbm.box[0]) == 0).any():
                    raise RuntimeError(
                        "Could not find sparse masks for any of the subset-labels"
                    )
            except NotImplementedError:
                sbm = None

        if sbm is None:
            boxes = boxes_from_grid(volume_service.bounding_box_zyx,
                                    volume_service.preferred_message_shape,
                                    clipped=True)
            return np.array([*boxes])
        else:
            # brick_shape is only defined in the 'roi' branch above,
            # so use the service's preferred shape directly here.
            return sbm.sparse_boxes(volume_service.preferred_message_shape)
Example #6
    def init_boxes(self, volume_service, roi):
        if not roi["name"]:
            boxes = boxes_from_grid(volume_service.bounding_box_zyx,
                                    volume_service.preferred_message_shape,
                                    clipped=True)
            return np.array([*boxes])

        base_service = volume_service.base_service

        if not roi["server"] or not roi["uuid"]:
            assert isinstance(base_service, DvidVolumeService), \
                "Since you aren't using a DVID input source, you must specify the ROI server and uuid."

        roi["server"] = (roi["server"] or volume_service.server)
        roi["uuid"] = (roi["uuid"] or volume_service.uuid)

        if roi["scale"] is not None:
            scale = roi["scale"]
        elif isinstance(volume_service, ScaledVolumeService):
            scale = volume_service.scale_delta
            assert scale <= 5, \
                "The 'roi' option doesn't support volumes downscaled beyond level 5"
        else:
            scale = 0

        brick_shape = volume_service.preferred_message_shape
        assert not (brick_shape % 2**(5-scale)).any(), \
            "If using an ROI, select a brick shape that is divisible by 32"

        seg_box = volume_service.bounding_box_zyx
        seg_box = round_box(seg_box, 2**(5 - scale))
        seg_box_s0 = seg_box * 2**scale
        seg_box_s5 = seg_box // 2**(5 - scale)

        with Timer(
                f"Fetching mask for ROI '{roi['name']}' ({seg_box_s0[:, ::-1].tolist()})",
                logger):
            roi_mask_s5, _ = fetch_roi(roi["server"],
                                       roi["uuid"],
                                       roi["name"],
                                       format='mask',
                                       mask_box=seg_box_s5)

        # SBM 'full-res' corresponds to the input service voxels, not necessarily scale-0.
        sbm = SparseBlockMask(roi_mask_s5, seg_box, 2**(5 - scale))
        boxes = sbm.sparse_boxes(brick_shape)

        # Clip boxes to the true (not rounded) bounding box
        boxes[:, 0] = np.maximum(boxes[:, 0],
                                 volume_service.bounding_box_zyx[0])
        boxes[:, 1] = np.minimum(boxes[:, 1],
                                 volume_service.bounding_box_zyx[1])
        return boxes
Example #7
def split_brick(new_grid, original_brick):
    """
    Given a single brick and a new grid to which its data should be redistributed,
    split the brick into pieces, indexed by their NEW grid locations.
    
    The brick fragments are returned as Bricks themselves, but with relatively
    small volume and physical_box members.
    
    Note: It is probably a mistake to call this function for Bricks which have
          a larger physical_box than logical_box, so that is currently forbidden.
          (It would work here, but it implies that you will end up with some voxels
          represented multiple times in a given RDD of Bricks, with undefined results
          as to which ones are kept after you consolidate them into a new alignment.)
          
          However, the reverse is permitted, i.e. it is permitted for the DESTINATION
          grid to use a halo, in which case some pixels in the original brick will be
          duplicated to multiple destinations.
    
    Returns: [(box,Brick), (box, Brick), ....],
            where each Brick is a fragment (to be assembled later into the new grid's bricks),
            and 'box' is the logical_box of the Brick into which this fragment should be assembled.
    """
    new_logical_boxes_and_fragments = []
    
    # Forbid out-of-bounds physical_boxes. (See note above.)
    assert ((original_brick.physical_box[0] >= original_brick.logical_box[0]).all() and
            (original_brick.physical_box[1] <= original_brick.logical_box[1]).all())
    
    # Iterate over the new boxes that intersect with the original brick
    for destination_box in boxes_from_grid(original_brick.physical_box, new_grid, include_halos=True):
        # Physical intersection of original with new
        split_box = box_intersection(destination_box, original_brick.physical_box)
        
        # Extract portion of original volume data that belongs to this new box
        split_box_internal = split_box - original_brick.physical_box[0]
        fragment_vol = extract_subvol(original_brick.volume, split_box_internal)

        # Subtract out halo to get logical_box
        new_logical_box = destination_box - (-new_grid.halo_shape, new_grid.halo_shape)

        fragment_brick = Brick(new_logical_box, split_box, fragment_vol)
        fragment_brick.compress()

        # Append key (the new_logical_box, but with a special type and hash,
        # to avoid bad collisions with the default spark hash function),
        # and new brick fragment, to be assembled into the final brick in a later stage.
        key = rt.tuple_with_hash( box_as_tuple(new_logical_box) )
        key.set_hash( hash(tuple(new_logical_box[0] / new_grid.block_shape)) )
        new_logical_boxes_and_fragments.append( (key, fragment_brick) )

    return new_logical_boxes_and_fragments
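The halo arithmetic from the "Subtract out halo" step above, isolated as pure numpy. A box is a small array whose rows are (start, stop), so subtracting (-halo, +halo) shrinks a haloed box back to its logical box:

import numpy as np

halo_shape = np.array([2, 2, 2])
destination_box = np.array([[-2, -2, -2],   # logical box [[0,0,0],[64,64,64]],
                            [66, 66, 66]])  # expanded by a 2px halo
logical_box = destination_box - (-halo_shape, halo_shape)
print(logical_box.tolist())  # [[0, 0, 0], [64, 64, 64]]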
Example #8
def test_boxes_from_grid_1():
    # Set a non-aligned bounding box
    grid = Grid( (10,20), (0,0) )
    bounding_box = np.array([(15,30), (95,290)])
    
    aligned_bounding_box = (  bounding_box[0]                          // grid.block_shape * grid.block_shape,
                             (bounding_box[1] + grid.block_shape - 1 ) // grid.block_shape * grid.block_shape )
    
    aligned_bb_shape = aligned_bounding_box[1] - aligned_bounding_box[0]
    
    boxes = np.array(list(boxes_from_grid(bounding_box, grid)))
    assert boxes.shape == (np.prod( aligned_bb_shape / grid.block_shape ), 2, 2)
    assert (boxes % grid.block_shape == 0).all()
    assert (boxes[:, 1, :] - boxes[:, 0, :] == grid.block_shape).all()
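For a non-aligned bounding box, the generated boxes cover the aligned hull, and the clipped keyword (used by most examples here; behavior assumed from them) trims each box to the requested bounds:

import numpy as np
from neuclease.util import Grid, boxes_from_grid

grid = Grid((10, 20), (0, 0))
bounding_box = [(15, 30), (95, 290)]

unclipped = np.array(list(boxes_from_grid(bounding_box, grid)))
clipped = np.array(list(boxes_from_grid(bounding_box, grid, clipped=True)))

print(unclipped[0].tolist())  # [[10, 20], [20, 40]], extends past the bounding box
print(clipped[0].tolist())    # [[15, 30], [20, 40]], trimmed to the bounding box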
Example #9
    def init_boxes(self, volume_service, roi):
        if not roi:
            boxes = boxes_from_grid(volume_service.bounding_box_zyx,
                                    volume_service.preferred_message_shape,
                                    clipped=True)
            return np.array([*boxes])

        base_service = volume_service.base_service
        assert isinstance(base_service, DvidVolumeService), \
            "Can't specify an ROI unless you're using a dvid input"

        assert isinstance(volume_service, (ScaledVolumeService, DvidVolumeService)), \
            "The 'roi' option doesn't support adapters other than 'rescale-level'"
        scale = 0
        if isinstance(volume_service, ScaledVolumeService):
            scale = volume_service.scale_delta
            assert scale <= 5, \
                "The 'roi' option doesn't support volumes downscaled beyond level 5"

        server, uuid, _seg_instance = base_service.instance_triple

        brick_shape = volume_service.preferred_message_shape
        assert not (brick_shape % 2**(5-scale)).any(), \
            "If using an ROI, select a brick shape that is divisible by 32"

        seg_box = volume_service.bounding_box_zyx
        seg_box = round_box(seg_box, 2**(5 - scale))
        seg_box_s0 = seg_box * 2**scale
        seg_box_s5 = seg_box // 2**(5 - scale)

        with Timer(
                f"Fetching mask for ROI '{roi}' ({seg_box_s0[:, ::-1].tolist()})",
                logger):
            roi_mask_s5, _ = fetch_roi(server,
                                       uuid,
                                       roi,
                                       format='mask',
                                       mask_box=seg_box_s5)

        # SBM 'full-res' corresponds to the input service voxels, not necessarily scale-0.
        sbm = SparseBlockMask(roi_mask_s5, seg_box, 2**(5 - scale))
        boxes = sbm.sparse_boxes(brick_shape)

        # Clip boxes to the true (not rounded) bounding box
        boxes[:, 0] = np.maximum(boxes[:, 0],
                                 volume_service.bounding_box_zyx[0])
        boxes[:, 1] = np.minimum(boxes[:, 1],
                                 volume_service.bounding_box_zyx[1])
        return boxes
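The scale-5 arithmetic above recurs throughout these examples: DVID ROIs are stored 32x downsampled (scale 5), so the segmentation box is rounded to a multiple of 2**5 before converting between scales. In isolation (round_box assumed to behave as in the examples):

import numpy as np
from neuclease.util import round_box

seg_box = np.array([[100, 200, 300], [5000, 6000, 7000]])
seg_box = round_box(seg_box, 2**5)   # align to 32px boundaries
seg_box_s5 = seg_box // 2**5         # the same region in scale-5 coordinates
seg_box_s0 = seg_box_s5 * 2**5       # and back at full resolution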
Example #10
def download(bounding_box_zyx, output_path):
    shape = bounding_box_zyx[1] - bounding_box_zyx[0]

    with h5py.File(output_path, 'w') as f:
        gray_dset = f.create_dataset('grayscale',
                                     shape=shape,
                                     dtype=np.uint8,
                                     chunks=True)
        seg_dset = f.create_dataset('segmentation',
                                    shape=shape,
                                    dtype=np.uint64,
                                    chunks=True,
                                    compression='gzip')

        print("Downloading grayscale...")
        block_shape = (256, 256, 256)
        block_boxes = boxes_from_grid(bounding_box_zyx,
                                      block_shape,
                                      clipped=True)
        for block_box in tqdm(block_boxes):
            relative_box = block_box - bounding_box_zyx[0]
            block_gray = fetch_raw(*GRAYSCALE, block_box)
            overwrite_subvol(gray_dset, relative_box, block_gray)

        print("")
        print("Downloading segmentation...")
        block_boxes = boxes_from_grid(bounding_box_zyx,
                                      block_shape,
                                      clipped=True)
        for block_box in tqdm(block_boxes):
            relative_box = block_box - bounding_box_zyx[0]
            block_seg = fetch_labelmap_voxels(*SEGMENTATION, block_box)
            overwrite_subvol(seg_dset, relative_box, block_seg)

    print("")
    print("DONE")
Example #11
def test_boxes_from_grid_2():
    # Use a grid offset
    grid = Grid( (10,20), (2,3) )
    bounding_box = np.array([(5,10), (95,290)])
    
    aligned_bounding_box = (  bounding_box[0]                          // grid.block_shape * grid.block_shape,
                             (bounding_box[1] + grid.block_shape - 1 ) // grid.block_shape * grid.block_shape )
    
    aligned_bb_shape = aligned_bounding_box[1] - aligned_bounding_box[0]
    
    boxes = np.array(list(boxes_from_grid(bounding_box, grid)))
    assert boxes.shape == (np.prod( aligned_bb_shape / grid.block_shape ), 2, 2)
    
    # Boxes should be offset by grid.offset.
    assert ((boxes - grid.offset) % grid.block_shape == 0).all()
    assert (boxes[:, 1, :] - boxes[:, 0, :] == grid.block_shape).all()
Example #12
    def _execute_scale(self, scale, starting_batch, mask_s5, mask_box_s5):
        options = self.config["masksegmentation"]
        block_width = self.output_service.block_width

        def scale_box(box, scale):
            # Scale down, then round up to the nearest multiple of the block width
            box = np.ceil(box / 2**scale).astype(np.int32)
            return round_box(box, block_width)

        # bounding box of the segmentation at the current scale.
        bounding_box = scale_box(self.input_service.bounding_box_zyx, scale)

        # Don't make bricks that are wider than the bounding box at this scale
        brick_shape = np.minimum(self.input_service.preferred_message_shape,
                                 bounding_box[1])
        assert not (brick_shape % block_width).any()

        brick_boxes = boxes_from_grid(bounding_box, brick_shape, clipped=True)

        with Timer(f"Scale {scale}: Preparing bricks", logger):
            boxes_and_masks = []
            for box in brick_boxes:
                mask_block_box = ((box // 2**(5 - scale)) - mask_box_s5[0])
                mask_block_box = mask_block_box.astype(
                    np.int32)  # necessary when scale is > 5
                mask_block_s5 = extract_subvol(mask_s5, mask_block_box)
                if mask_block_s5.any():
                    boxes_and_masks.append((box, mask_block_s5))

        batches = [*iter_batches(boxes_and_masks, options["batch-size"])]

        if starting_batch == 0:
            logger.info(f"Scale {scale}: Processing {len(batches)} batches")
        else:
            logger.info(
                f"Scale {scale}: Processing {len(batches) - starting_batch} "
                f"remaining batches from {len(batches)} original batches")

            assert starting_batch < len(batches), \
                f"Can't start at batch {starting_batch}; there are only {len(batches)} in total."
            batches = batches[starting_batch:]

        for batch_index, batch_boxes_and_masks in enumerate(
                batches, start=starting_batch):
            with Timer(f"Scale {scale}: Batch {batch_index:02d}", logger):
                self._execute_batch(scale, batch_index, batch_boxes_and_masks)
Example #13
    def init_boxes(self, volume_service, roi, chunk_shape_s0):
        """
        Return a set of bounding boxes to tile the given ROI.
        Scale 0 of the volume service should correspond to full-res data,
        which is 32x higher-res than ROI resolution.
        """
        if not roi["name"]:
            boxes = boxes_from_grid(volume_service.bounding_box_zyx,
                                    chunk_shape_s0,
                                    clipped=True)
            return np.array([*boxes])

        base_service = volume_service.base_service

        if not roi["server"] or not roi["uuid"]:
            assert isinstance(base_service, DvidVolumeService), \
                "Since you aren't using a DVID input source, you must specify the ROI server and uuid."

        roi["server"] = (roi["server"] or volume_service.server)
        roi["uuid"] = (roi["uuid"] or volume_service.uuid)

        assert not (chunk_shape_s0 % 2**5).any(), \
            "If using an ROI, select a chunk shape that is divisible by 32"

        seg_box_s0 = volume_service.bounding_box_zyx
        seg_box_s0 = round_box(seg_box_s0, 2**5)
        seg_box_s5 = seg_box_s0 // 2**5

        with Timer(
                f"Fetching mask for ROI '{roi['name']}' ({seg_box_s0[:, ::-1].tolist()})",
                logger):
            roi_mask_s5, _ = fetch_roi(roi["server"],
                                       roi["uuid"],
                                       roi["name"],
                                       format='mask',
                                       mask_box=seg_box_s5)

        # SBM 'full-res' corresponds to the input service voxels, not necessarily scale-0.
        sbm = SparseBlockMask(roi_mask_s5, seg_box_s0, 2**5)
        boxes = sbm.sparse_boxes(chunk_shape_s0)

        # Clip boxes to the true (not rounded) bounding box
        boxes[:, 0] = np.maximum(boxes[:, 0],
                                 volume_service.bounding_box_zyx[0])
        boxes[:, 1] = np.minimum(boxes[:, 1],
                                 volume_service.bounding_box_zyx[1])
        return boxes
Example #14
    def get_subvolume(self, box, scale=0):
        req_bytes = 8 * np.prod(box[1] - box[0])
        with self._resource_manager_client.access_context(
                'brainmaps', True, 1, req_bytes):
            if not self._fetch_blockwise:
                return self._brainmaps_client.get_subvolume(box, scale)
            else:
                block_shape = 3 * (self._block_width, )
                subvol = np.zeros(box[1] - box[0], self.dtype)
                for block_box in boxes_from_grid(box,
                                                 block_shape,
                                                 clipped=True):
                    block = self._brainmaps_client.get_subvolume(
                        block_box, scale)
                    outbox = block_box - box[0]
                    subvol[box_to_slicing(*outbox)] = block
                return subvol
Example #15
        def timed_fetch_blocks_from_box(box):
            """
            Fetch the blocks for a given box and return the time it took to fetch them.
            Do not bother decompressing the blocks or combining them into a single volume.
            """
            assert not (box % block_shape).any(), "For this test, all requests must be block-aligned"
            block_boxes = list( boxes_from_grid(box, Grid(block_shape)) )
            block_coords_xyz = np.array(block_boxes)[:,0,::-1] // block_shape
            block_coords_str = ','.join(map(str, block_coords_xyz.flat))

            voxel_count = np.prod(box[1] - box[0])

            session = default_dvid_session()
            url = f'{server}/api/node/{uuid}/{instance}/specificblocks?blocks={block_coords_str}'
            
            with resource_mgr_client.access_context(server, True, 1, voxel_count):
                timestamp = datetime.now()
                with Timer() as timer:
                    r = session.get(url)
            
            r.raise_for_status()
            return timestamp, voxel_count, len(r.content), timer.seconds
Example #16
def copy_synapses(src_loc, dst_loc, processes):
    """
    See caveats in the module docstring above.
    """
    src_loc = Location(*src_loc)
    dst_loc = Location(*dst_loc)

    # Create the destination instance if necessary.
    dst_instances = fetch_repo_instances(*dst_loc[:2], 'annotation')
    if dst_loc.syn_instance not in dst_instances:
        logger.info(f"Creating instance '{dst_loc.syn_instance}'")
        create_instance(*dst_loc[:3], 'annotation')

    # Check to see if the sync already exists; add it if necessary
    syn_info = fetch_instance_info(*dst_loc[:3])
    if len(syn_info["Base"]["Syncs"]) == 0:
        logger.info(
            f"Adding a sync to '{dst_loc.syn_instance}' from '{dst_loc.seg_instance}'"
        )
        post_sync(*dst_loc[:3], [dst_loc.seg_instance])
    elif syn_info["Base"]["Syncs"][0] != dst_loc.seg_instance:
        other_seg = syn_info["Base"]["Syncs"][0]
        raise RuntimeError(
            f"Can't create a sync to '{dst_loc.seg_instance}'. "
            f"Your instance is already sync'd to a different segmentation: {other_seg}"
        )

    # Fetch segmentation extents
    bounding_box_zyx = fetch_volume_box(*src_loc[:2],
                                        src_loc.seg_instance).tolist()

    # Break into block-aligned chunks (boxes) that are long in the X direction
    # (optimal access pattern for dvid read/write)
    boxes = boxes_from_grid(bounding_box_zyx, (256, 256, 6400), clipped=True)

    # Use a process pool to copy the chunks in parallel.
    fn = partial(copy_syn_blocks, src_loc, dst_loc)
    compute_parallel(fn, boxes, processes=processes, ordered=False)
Example #17
def block_stats_from_brick(block_shape, brick):
    """
    Get the count of voxels for each segment (excluding segment 0)
    in each block within the given brick, returned as a DataFrame.
    
    Returns a DataFrame with the following columns:
        ['segment_id', 'z', 'y', 'x', 'count']
        where z,y,x are the starting coordinates of each block.
    """
    block_grid = Grid(block_shape)
    
    block_dfs = []
    block_boxes = boxes_from_grid(brick.physical_box, block_grid)
    for box in block_boxes:
        clipped_box = box_intersection(box, brick.physical_box) - brick.physical_box[0]
        block_vol = brick.volume[box_to_slicing(*clipped_box)]
        counts = pd.Series(block_vol.reshape(-1)).value_counts(sort=False)
        segment_ids = counts.index.values
        counts = counts.values.astype(np.uint32)

        box = box.astype(np.int32)

        block_df = pd.DataFrame( { 'segment_id': segment_ids,
                                   'count': counts,
                                   'z': box[0][0],
                                   'y': box[0][1],
                                   'x': box[0][2] } )

        # Exclude segment 0 from output        
        block_df = block_df[block_df['segment_id'] != 0]

        block_dfs.append(block_df)

    brick_df = pd.concat(block_dfs, ignore_index=True)
    brick_df = brick_df[['segment_id', 'z', 'y', 'x', 'count']]
    assert list(brick_df.columns) == list(BLOCK_STATS_DTYPES.keys())
    return brick_df
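The per-block counting trick above, in isolation: value_counts() on the flattened block yields voxel counts per segment, and segment 0 is dropped afterward.

import numpy as np
import pandas as pd

block_vol = np.array([[0, 1, 1],
                      [2, 2, 2],
                      [0, 0, 1]], dtype=np.uint64)

counts = pd.Series(block_vol.reshape(-1)).value_counts(sort=False)
block_df = pd.DataFrame({'segment_id': counts.index.values,
                         'count': counts.values.astype(np.uint32)})
block_df = block_df[block_df['segment_id'] != 0]
print(block_df)  # two rows: segment 1 -> 3 voxels, segment 2 -> 3 voxels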
Example #18
    def init_boxes(self, volume_service, roi):
        if not roi["name"]:
            boxes = boxes_from_grid(volume_service.bounding_box_zyx,
                                    volume_service.preferred_message_shape,
                                    clipped=True)
            return np.array([*boxes])

        server, uuid, roi_name = roi["server"], roi["uuid"], roi["name"]
        roi_scale = roi["relative-scale"]

        brick_shape = volume_service.preferred_message_shape
        assert not (brick_shape % 2**roi_scale).any(), \
            "If using an ROI, select a brick shape that is divisible by 32"

        seg_box = volume_service.bounding_box_zyx
        seg_box = round_box(seg_box, 2**roi_scale)
        seg_box_s5 = seg_box // 2**roi_scale

        with Timer(
                f"Fetching mask for ROI '{roi_name}' ({seg_box[:, ::-1].tolist()})",
                logger):
            roi_mask_s5, _ = fetch_roi(server,
                                       uuid,
                                       roi_name,
                                       format='mask',
                                       mask_box=seg_box_s5)

        # SBM 'full-res' corresponds to the input service voxels, not necessarily scale-0.
        sbm = SparseBlockMask(roi_mask_s5, seg_box, 2**roi_scale)
        boxes = sbm.sparse_boxes(brick_shape)

        # Clip boxes to the true (not rounded) bounding box
        boxes[:, 0] = np.maximum(boxes[:, 0],
                                 volume_service.bounding_box_zyx[0])
        boxes[:, 1] = np.minimum(boxes[:, 1],
                                 volume_service.bounding_box_zyx[1])
        return boxes
Example #19
        def timed_fetch_blocks_from_box(box):
            """
            Fetch the blocks for a given box and return the time it took to fetch them.
            Do not bother decompressing the blocks or combining them into a single volume.
            """
            assert not (box % block_shape).any(
            ), "For this test, all requests must be block-aligned"
            block_boxes = list(boxes_from_grid(box, Grid(block_shape)))
            block_coords_xyz = np.array(block_boxes)[:, 0, ::-1] // block_shape
            block_coords_str = ','.join(map(str, block_coords_xyz.flat))

            voxel_count = np.prod(box[1] - box[0])

            session = default_dvid_session()
            url = f'{server}/api/node/{uuid}/{instance}/specificblocks?blocks={block_coords_str}'

            with resource_mgr_client.access_context(server, True, 1,
                                                    voxel_count):
                timestamp = datetime.now()
                with Timer() as timer:
                    r = session.get(url)

            r.raise_for_status()
            return timestamp, voxel_count, len(r.content), timer.seconds
Example #20
def test_copygrayscale_from_hdf5_to_n5(disable_auto_retry):
    template_dir = tempfile.mkdtemp(suffix="copygrayscale-hdf5-to-n5-template")

    SHAPE = (250, 240, 230)

    # Create volume, write to HDF5
    volume = np.random.randint(10, size=SHAPE, dtype=np.uint8)
    volume_path = f"{template_dir}/volume.h5"
    with h5py.File(volume_path, 'w') as f:
        f['volume'] = volume

    config_text = textwrap.dedent(f"""\
        workflow-name: copygrayscale
        cluster-type: {CLUSTER_TYPE}
        
        input:
          hdf5:
            path: {volume_path}
            dataset: volume
          
          geometry:
            message-block-shape: [64,64,256]
            bounding-box: [[0,0,0], {[*SHAPE[::-1]]}]

          adapters:
            # Enable multi-scale, since otherwise
            # Hdf5VolumeService doesn't support it out-of-the box
            rescale-level: 0

        output:
          n5:
            path: output.n5
            dataset: s0
            create-if-necessary: true
            creation-settings:
              dtype: uint8
              # max-scale: 2 # Should be set automatically from max-pyramid-scale

          geometry:
            message-block-shape: [256,128,128]
            available-scales: [0,1,2]
        
        copygrayscale:
          max-pyramid-scale: 2
          slab-depth: 128
          fill-blocks: false  # N5 is block-based, but does not require (or allow) us to pad the boundary blocks.
    """)

    with open(f"{template_dir}/workflow.yaml", 'w') as f:
        f.write(config_text)

    # Run
    _execution_dir, workflow = launch_flow(template_dir, 1)
    final_config = workflow.config

    input_service = VolumeService.create_from_config(final_config['input'])
    output_service = VolumeService.create_from_config(final_config['output'])

    # Check results -- must use half-brick-aligned checks to ensure that the downsampling is the same.
    # And don't check scale 2 -- there is too much difference between downsampled by 4 vs downsampled-by-2-twice
    for scale in range(2):
        scaled_box = output_service.bounding_box_zyx // 2**scale
        brick_shape = output_service.preferred_message_shape // 2
        for brick_box in boxes_from_grid(scaled_box, brick_shape,
                                         clipped=True):
            expected_vol = input_service.get_subvolume(brick_box, scale)
            output_vol = output_service.get_subvolume(brick_box, scale)

            assert np.allclose(output_vol, expected_vol, 2, 5), \
                f"Written vol does not match expected at scale {scale}"
Example #21
def load_roi_label_volume(server,
                          uuid,
                          rois_or_neuprint,
                          box_s5=[None, None],
                          export_path=None,
                          export_labelmap=None):
    """
    Fetch several ROIs from DVID and combine them into a single label volume or mask.
    The label values in the returned volume correspond to the order in which the ROI
    names were passed in, starting at label 1.
    
    This function is essentially a convenience function around fetch_combined_roi_volume(),
    but in this case it will optionally auto-fetch the ROI list, and auto-export the volume.
    
    Args:
        server:
            DVID server

        uuid:
            DVID uuid

        rois_or_neuprint:
            Either a list of ROIs or a neuprint server from which to obtain the roi list.

        box_s5:
            If you want to restrict the ROIs to a particular subregion,
            you may pass your own bounding box (at scale 5).
            Alternatively, you may pass the name of a segmentation
            instance from DVID whose bounding box will be used.

        export_path:
            If you want the ROI volume to be exported to disk,
            provide a path name ending with .npy or .h5.
        
        export_labelmap:
            If you want the ROI volume to be exported to a DVID labelmap instance,
            Provide the instance name, or a tuple of (server, uuid, instance).
    
    Returns:
        (roi_vol, roi_box), containing the fetched label volume and the
        bounding box it corresponds to, in DVID scale-5 coordinates.

    Note:
      If you have a list of (full-res) points to extract from the returned volume,
      pass a DataFrame with columns ['z','y','x'] to the following function.
      If you already downloaded the roi_vol (above), provide it.
      Otherwise, leave out those args and it will be fetched first.
      Adds columns to the input DF (in-place) for 'roi' (str) and 'roi_label' (int).
    
        >>> from neuclease.dvid import determine_point_rois
        >>> determine_point_rois(*master, rois, point_df, roi_vol, roi_box)
    """
    if isinstance(box_s5, str):
        # Assume that this is a segmentation instance whose dimensions should be used
        # Fetch the maximum extents of the segmentation,
        # and rescale it for scale-5.
        seg_box = fetch_volume_box(server, uuid, box_s5)
        box_s5 = round_box(seg_box, (2**5), 'out') // 2**5
        box_s5[0] = (0, 0, 0)

    if export_labelmap:
        assert isinstance(box_s5, np.ndarray)
        assert not (box_s5 % 64).any(), \
            ("If exporting to a labelmap instance, please supply "
             "an explicit box and make sure it is block-aligned.")

    if isinstance(rois_or_neuprint, (str, neuprint.Client)):
        if isinstance(rois_or_neuprint, str):
            npclient = neuprint.Client(rois_or_neuprint)
        else:
            npclient = rois_or_neuprint

        # Fetch ROI names from neuprint
        q = "MATCH (m: Meta) RETURN m.superLevelRois as rois"
        rois = npclient.fetch_custom(q)['rois'].iloc[0]
        rois = sorted(rois)
        # # Remove '.*ACA' ROIs. Apparently there is some
        # # problem with them. (They overlap with other ROIs.)
        # rois = [*filter(lambda r: 'ACA' not in r, rois)]
    else:
        assert isinstance(rois_or_neuprint, collections.abc.Iterable)
        rois = rois_or_neuprint

    # Fetch each ROI and write it into a volume
    with Timer(f"Fetching combined ROI volume for {len(rois)} ROIs", logger):
        roi_vol, roi_box, overlap_stats = fetch_combined_roi_volume(
            server, uuid, rois, box_zyx=box_s5)

    if len(overlap_stats) > 0:
        logger.warning(
            f"Some ROIs overlap! Here's an incomplete list of overlapping pairs:\n{overlap_stats}"
        )

    # Export to npy/h5py for external use
    if export_path:
        with Timer(f"Exporting to {export_path}", logger):
            if export_path.endswith('.npy'):
                np.save(export_path, roi_vol)
            elif export_path.endswith('.h5'):
                with h5py.File(export_path, 'w') as f:
                    f.create_dataset('rois_scale_5', data=roi_vol, chunks=True)

    if export_labelmap:
        if isinstance(export_labelmap, str):
            export_labelmap = (server, uuid, export_labelmap)

        assert len(export_labelmap) == 3
        with Timer(f"Exporting to {export_labelmap[2]}", logger):
            if export_labelmap[2] not in fetch_repo_instances(
                    server, uuid, 'labelmap'):
                create_labelmap_instance(
                    *export_labelmap, voxel_size=8 * (2**5),
                    max_scale=6)  # FIXME: hard-coded voxel size

            # It's really important to use this block shape.
            # See https://github.com/janelia-flyem/dvid/issues/342
            boxes = boxes_from_grid(roi_box, (256, 256, 256), clipped=True)
            for box in tqdm_proxy(boxes):
                block = extract_subvol(roi_vol, box - roi_box[0])
                post_labelmap_voxels(*export_labelmap,
                                     box[0],
                                     block,
                                     scale=0,
                                     downres=True)

    return roi_vol, roi_box, rois
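A hypothetical call to the helper above; the server, uuid, and output path are placeholders, and box_s5 is given as an instance name so the segmentation's own extents are used (per the str branch in the code):

roi_vol, roi_box, rois = load_roi_label_volume(
    'emdata4:8900',                    # DVID server (placeholder)
    'abc123',                          # DVID uuid (placeholder)
    'https://neuprint.janelia.org',    # fetch the ROI list from neuprint
    box_s5='segmentation',             # use this instance's bounding box
    export_path='roi-vol.npy')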
Example #22
def generate_bricks_from_volume_source( bounding_box, grid, volume_accessor_func, sc=None, rdd_partition_length=None, sparse_boxes=None, lazy=False ):
    """
    Generate an RDD or iterable of Bricks for the given bounding box and grid.
     
    Args:
        bounding_box:
            (start, stop)
 
        grid:
            Grid (see above)
 
        volume_accessor_func:
            Callable with signature: f(box) -> ndarray
            Note: The callable will be unpickled only once per partition, so initialization
                  costs after unpickling are only incurred once per partition.
 
        sc:
            SparkContext. If provided, an RDD is returned.  Otherwise, returns an ordinary Python iterable.
 
        rdd_partition_length:
            Optional. If provided, the RDD will have (approximately) this many bricks per partition.
        
        sparse_boxes:
            Optional.
            A pre-calculated list of boxes to use instead of calculating
            the complete (dense) list of grid boxes within the bounding box.
            If provided, should be a list of physical boxes, and no two should occupy
            the same logical box, as defined by their midpoints.
            Note: They will still be clipped to the overall bounding_box.
        
        halo: An integer or shape indicating how much halo to add to each Brick's physical_box.
              The halo is applied in both 'dense' and 'sparse' cases.
    """
    if sparse_boxes is None:
        # Generate boxes from densely populated grid
        logical_boxes = boxes_from_grid(bounding_box, grid, include_halos=False)
        physical_boxes = clipped_boxes_from_grid(bounding_box, grid)
        logical_and_physical_boxes = zip( logical_boxes, physical_boxes )
    else:
        # User provided list of physical boxes.
        # Clip them to the bounding box and calculate the logical boxes.
        if not hasattr(sparse_boxes, '__len__'):
            sparse_boxes = list( sparse_boxes )
        physical_boxes = np.asarray( sparse_boxes )
        assert physical_boxes.ndim == 3 and physical_boxes.shape[1:3] == (2,3)

        def logical_and_clipped( box ):
            midpoint = (box[0] + box[1]) // 2
            logical_box = grid.compute_logical_box( midpoint )
            box += (-grid.halo_shape, grid.halo_shape)
            # Note: Non-intersecting boxes will have non-positive shape after clipping
            clipped_box = box_intersection(box, bounding_box)
            return ( logical_box, clipped_box )

        logical_and_physical_boxes = map(logical_and_clipped, physical_boxes)

        # Drop any boxes that fall completely outside the bounding box
        # Check that physical box doesn't completely fall outside its logical_box
        def is_valid(logical_and_physical):
            logical_box, physical_box = logical_and_physical
            return (physical_box[1] > logical_box[0]).all() and (physical_box[0] < logical_box[1]).all()
        logical_and_physical_boxes = filter(is_valid, logical_and_physical_boxes )

    if sc:
        if not hasattr(logical_and_physical_boxes, '__len__'):
            logical_and_physical_boxes = list(logical_and_physical_boxes) # need len()

        num_rdd_partitions = None
        if rdd_partition_length is not None:
            rdd_partition_length = max(1, rdd_partition_length)
            num_rdd_partitions = int( np.ceil( len(logical_and_physical_boxes) / rdd_partition_length ) )

        # If we're working with a tiny volume (e.g. testing),
        # make sure we at least parallelize across all cores.
        if num_rdd_partitions is not None and (num_rdd_partitions < cpus_per_worker() * num_worker_nodes()):
            num_rdd_partitions = cpus_per_worker() * num_worker_nodes()

        def brick_size(log_phys):
            _logical, physical = log_phys
            return np.uint64(np.prod(physical[1] - physical[0]))
        total_volume = sum(map(brick_size, logical_and_physical_boxes))
        logger.info(f"Initializing RDD of {len(logical_and_physical_boxes)} Bricks "
                    f"(over {num_rdd_partitions} partitions) with total volume {total_volume/1e9:.1f} Gvox")

        # Enumerate and repartition to get perfect partition sizes,
        # rather than relying on spark's default hash
        class _enumerated_value(tuple):
            # Return a hash based on the key alone.
            def __hash__(self):
                return self[0]

        enumerated_logical_and_physical_boxes = sc.parallelize( enumerate(logical_and_physical_boxes), num_rdd_partitions )
        enumerated_logical_and_physical_boxes = enumerated_logical_and_physical_boxes.map(_enumerated_value)
        enumerated_logical_and_physical_boxes = enumerated_logical_and_physical_boxes.partitionBy(num_rdd_partitions, lambda x: x)
        logical_and_physical_boxes = enumerated_logical_and_physical_boxes.values()

    def make_bricks( logical_and_physical_box ):
        logical_box, physical_box = logical_and_physical_box
        if lazy:
            return Brick(logical_box, physical_box, lazy_creation_fn=volume_accessor_func)
        else:
            volume = volume_accessor_func(physical_box)
            return Brick(logical_box, physical_box, volume)
    
    bricks = rt.map( make_bricks, logical_and_physical_boxes )
    return bricks
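The dense-grid pairing above, in isolation: logical boxes tile whole grid squares while physical boxes are the same squares clipped to the bounding box (clipped_boxes_from_grid is assumed equivalent to boxes_from_grid(..., clipped=True), as the later dask version of this function suggests):

import numpy as np
from neuclease.util import Grid, boxes_from_grid

bounding_box = [(0, 0), (25, 25)]
grid = Grid((10, 10))

logical = np.array(list(boxes_from_grid(bounding_box, grid)))
physical = np.array(list(boxes_from_grid(bounding_box, grid, clipped=True)))

for lbox, pbox in zip(logical, physical):
    print(lbox.tolist(), '->', pbox.tolist())
# The corner square pairs [[20, 20], [30, 30]] with [[20, 20], [25, 25]].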
Example #23
    def init_boxes(self, volume_service, subset_labels, roi):
        sbm = None
        if roi:
            base_service = volume_service.base_service
            assert isinstance(base_service, DvidVolumeService), \
                "Can't specify an ROI unless you're using a dvid input"

            assert isinstance(volume_service, (ScaledVolumeService, DvidVolumeService)), \
                "The 'roi' option doesn't support adapters other than 'rescale-level'"
            scale = 0
            if isinstance(volume_service, ScaledVolumeService):
                scale = volume_service.scale_delta
                assert scale <= 5, \
                    "The 'roi' option doesn't support volumes downscaled beyond level 5"

            server, uuid, _seg_instance = base_service.instance_triple

            brick_shape = volume_service.preferred_message_shape
            assert not (brick_shape % 2**(5-scale)).any(), \
                "If using an ROI, select a brick shape that is divisible by 32"

            seg_box = volume_service.bounding_box_zyx
            seg_box = round_box(seg_box, brick_shape)
            seg_box_s5 = seg_box // 2**(5 - scale)

            with Timer(f"Fetching mask for ROI '{roi}'", logger):
                roi_mask_s5, roi_box_s5 = fetch_roi(server,
                                                    uuid,
                                                    roi,
                                                    format='mask')

            # Restrict to input bounding box
            clipped_roi_box_s5 = box_intersection(seg_box_s5, roi_box_s5)
            clipped_roi_mask_s5 = extract_subvol(
                roi_mask_s5, clipped_roi_box_s5 - roi_box_s5[0])

            # Align to brick grid
            aligned_roi_box_s5 = round_box(clipped_roi_box_s5,
                                           brick_shape // 2**5, 'out')
            padding = (aligned_roi_box_s5 - clipped_roi_box_s5)
            padding[0] *= -1
            aligned_roi_mask_s5 = np.pad(clipped_roi_mask_s5,
                                         padding.transpose())

            # At the service native scale
            aligned_roi_box = (2**(5 - scale) * aligned_roi_box_s5)
            logger.info(
                f"Brick-aligned ROI '{roi}' has bounding-box {aligned_roi_box[:, ::-1].tolist()}"
            )

            # SBM 'full-res' corresponds to the input service voxels, not necessarily scale-0.
            sbm = SparseBlockMask.create_from_highres_mask(
                aligned_roi_mask_s5, 2**(5 - scale), aligned_roi_box,
                brick_shape)
        elif subset_labels:
            try:
                sbm = volume_service.sparse_block_mask_for_labels(
                    [*subset_labels])
                if ((sbm.box[1] - sbm.box[0]) == 0).any():
                    raise RuntimeError(
                        "Could not find sparse masks for any of the subset-labels"
                    )
            except NotImplementedError:
                sbm = None

        if sbm is None:
            boxes = boxes_from_grid(volume_service.bounding_box_zyx,
                                    volume_service.preferred_message_shape,
                                    clipped=True)
            return np.array([*boxes])
        else:
            # brick_shape is only defined in the 'roi' branch above,
            # so use the service's preferred shape directly here.
            boxes = sbm.sparse_boxes(volume_service.preferred_message_shape)
            boxes = np.array(boxes)

            # Clip
            boxes[:, 0, :] = np.maximum(volume_service.bounding_box_zyx[0],
                                        boxes[:, 0, :])
            boxes[:, 1, :] = np.minimum(volume_service.bounding_box_zyx[1],
                                        boxes[:, 1, :])
            assert (boxes[:,0,:] < boxes[:,1,:]).all(), \
                "After cropping to input volume, some bricks disappeared."

            return boxes
Example #24
    def _init_mask(self):
        """
        - read the mask ROI as a volume
        - dilate/erode it if necessary
        - invert it if necessary
        - save to .h5 (just for offline debug)
        - return the scale-5 mask and its scale-5 bounding-box
        """
        options = self.config["masksegmentation"]
        roi = options["mask-roi"]
        invert_mask = options["invert-mask"]
        max_scale = options["max-pyramid-scale"]
        roi_dilation = options["dilate-roi"]
        roi_erosion = options["erode-roi"]
        seg_dilation = options["dilate-segmentation"]

        block_width = self.output_service.block_width

        # Select a mask_box that's large enough to divide evenly into the
        # block width even when reduced to the highest scale we'll be processing.
        seg_box = round_box(self.input_service.bounding_box_zyx,
                            block_width * 2**max_scale)
        seg_box_s5 = round_box(seg_box, 2**5) // (2**5)

        with Timer(f"Loading ROI '{roi}'", logger):
            roi_mask, _ = fetch_roi(self.input_service.server,
                                    self.input_service.uuid,
                                    roi,
                                    format='mask',
                                    mask_box=seg_box_s5)

        with h5py.File('roi-mask.h5', 'w') as f:
            f.create_dataset('mask',
                             data=roi_mask.view(np.uint8),
                             chunks=(128, 128, 128))

        assert not (roi_dilation and roi_erosion)

        if roi_dilation > 0:
            with Timer(f"Dilating ROI by {roi_dilation}", logger):
                roi_mask = vigra.filters.multiBinaryDilation(
                    roi_mask, roi_dilation)
            with h5py.File('dilated-roi-mask.h5', 'w') as f:
                f.create_dataset('mask',
                                 data=roi_mask.view(np.uint8),
                                 chunks=(128, 128, 128))

        if roi_erosion > 0:
            with Timer(f"Eroding ROI by {roi_erosion}", logger):
                roi_mask = vigra.filters.multiBinaryErosion(
                    roi_mask, roi_erosion)
            with h5py.File('eroded-roi-mask.h5', 'w') as f:
                f.create_dataset('mask',
                                 data=roi_mask.view(np.uint8),
                                 chunks=(128, 128, 128))

        assert not seg_dilation or invert_mask, \
            "Can't use 'dilate-segmentation'. The segmentation isn't downloaded unless 'invert-mask' is used."

        if invert_mask:
            with Timer("Inverting mask", logger):
                # Initialize the mask with entire segmentation at scale 5,
                # then subtract the roi from it.
                boxes = [
                    *boxes_from_grid(seg_box_s5, (64, 64, 2048), clipped=True)
                ]

                input_service = self.input_service

                def fetch_seg_mask_s5(box_s5):
                    seg_s5 = input_service.get_subvolume(box_s5, scale=5)
                    return box_s5, (seg_s5 != 0)

                boxes_and_mask = dask.bag.from_sequence(
                    boxes, 1).map(fetch_seg_mask_s5).compute()

                seg_mask = np.zeros(box_shape(seg_box_s5), bool)
                for box_s5, box_mask in boxes_and_mask:
                    overwrite_subvol(seg_mask, box_s5, box_mask)

                if seg_dilation == 0:
                    with h5py.File('segmentation-mask.h5', 'w') as f:
                        f.create_dataset('mask',
                                         data=seg_mask.view(np.uint8),
                                         chunks=(128, 128, 128))
                else:
                    with Timer(f"Dilating segmentation by {seg_dilation}",
                               logger):
                        seg_mask = vigra.filters.multiBinaryDilation(
                            seg_mask, seg_dilation)

                    with h5py.File('dilated-segmentation-mask.h5', 'w') as f:
                        f.create_dataset('mask',
                                         data=seg_mask.view(np.uint8),
                                         chunks=(128, 128, 128))

                seg_mask[roi_mask] = False
                roi_mask = seg_mask

        with h5py.File('final-mask.h5', 'w') as f:
            f.create_dataset('mask',
                             data=roi_mask.view(np.uint8),
                             chunks=(128, 128, 128))

        # Downsample the roi_mask to dvid-block resolution, just to see how many blocks it touches.
        block_mask = view_as_blocks(roi_mask, (2, 2, 2)).any(axis=(3, 4, 5))
        blocks_touched = block_mask.sum()
        voxel_total = blocks_touched * (block_width**3)
        logger.info(
            f"Mask touches {blocks_touched} blocks ({voxel_total / 1e9:.1f} Gigavoxels)"
        )

        return roi_mask, seg_box_s5
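The block-occupancy check above, in isolation: skimage's view_as_blocks reshapes the mask into non-overlapping blocks so a single .any() can test every block at once.

import numpy as np
from skimage.util import view_as_blocks

mask = np.zeros((4, 4, 4), dtype=bool)
mask[0, 0, 0] = True  # touch one voxel in the first 2x2x2 block

block_mask = view_as_blocks(mask, (2, 2, 2)).any(axis=(3, 4, 5))
print(block_mask.shape)  # (2, 2, 2), one entry per block
print(block_mask.sum())  # 1, only the first block is touched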
Example #25
def split_brick(new_grid, original_brick):
    """
    Given a single brick and a new grid to which its data should be redistributed,
    split the brick into pieces, indexed by their NEW grid locations.

    The brick fragments are returned as Bricks themselves, but with relatively
    small volume and physical_box members.

    Note: It is probably a mistake to call this function for Bricks which have
          a larger physical_box than logical_box, so that is currently forbidden.
          (It would work here, but it implies that you will end up with some voxels
          represented multiple times in a given RDD of Bricks, with undefined results
          as to which ones are kept after you consolidate them into a new alignment.)

          However, the reverse is permitted, i.e. it is permitted for the DESTINATION
          grid to use a halo, in which case some pixels in the original brick will be
          duplicated to multiple destinations.

    Returns: [Brick, Brick, ....],
            where each Brick is a fragment (to be assembled later into the new grid's bricks),
    """
    fragments = []

    # Forbid out-of-bounds physical_boxes. (See note above.)
    assert ((original_brick.physical_box[0] >= original_brick.logical_box[0]).all() and
            (original_brick.physical_box[1] <= original_brick.logical_box[1]).all()), \
                f"{original_brick.physical_box[:,::-1].tolist()} extends outside of {original_brick.logical_box[:,::-1].tolist()}"

    ## FIXME:
    ## If the brick lies completely within a single grid square of the destination grid,
    ## then boxes_from_grid() will only return a single box and the brick's volume will remain unchanged.
    ## In that case, it's probably best not to uncompress/recompress the brick.
    ## Just create a new brick with the same compressed data and a different logical_box.

    # Iterate over the new boxes that intersect with the original brick
    for destination_box in boxes_from_grid(original_brick.physical_box,
                                           new_grid,
                                           include_halos=True):
        # Physical intersection of original with new
        split_box = box_intersection(destination_box,
                                     original_brick.physical_box)

        # Extract portion of original volume data that belongs to this new box
        split_box_internal = split_box - original_brick.physical_box[0]
        fragment_vol = extract_subvol(original_brick.volume,
                                      split_box_internal)

        # Subtract out halo to get logical_box
        new_logical_box = destination_box - (-new_grid.halo_shape,
                                             new_grid.halo_shape)

        new_location_id = tuple(new_logical_box[0] // new_grid.block_shape)

        fragment_brick = Brick(new_logical_box,
                               split_box,
                               fragment_vol,
                               location_id=new_location_id,
                               compression=original_brick.compression)
        fragment_brick.compress()

        fragments.append(fragment_brick)

    original_brick.compress()
    return fragments
Example #26
def generate_bricks_from_volume_source(bounding_box,
                                       grid,
                                       volume_accessor_func,
                                       client,
                                       partition_size=None,
                                       sparse_boxes=None,
                                       lazy=False,
                                       compression=None):
    """
    Generate a dask.Bag of Bricks for the given bounding box and grid.

    Args:
        bounding_box:
            (start, stop)

        grid:
            Grid (see above)

        volume_accessor_func:
            Callable with signature: f(box) -> ndarray
            Note: The callable will be unpickled only once per partition, so initialization
                  costs after unpickling are only incurred once per partition.

        client:
            dask.Client

        partition_size:
            Optional. If provided, the dask.Bag will have (approximately) this many bricks per partition.

        sparse_boxes:
            Optional.
            A pre-calculated list of boxes to use instead of calculating
            the complete (dense) list of grid boxes within the bounding box.
            If provided, should be a list of physical boxes, and no two should occupy
            the same logical box, as defined by their midpoints.
            Note: They will still be clipped to the overall bounding_box.

        halo: An integer or shape indicating how much halo to add to each Brick's physical_box.
              The halo is applied in both 'dense' and 'sparse' cases.
    """
    if client is None:
        client = DebugClient()

    if sparse_boxes is None:
        # Generate boxes from densely populated grid
        logical_boxes = boxes_from_grid(bounding_box,
                                        grid,
                                        include_halos=False)
        physical_boxes = boxes_from_grid(bounding_box,
                                         grid,
                                         include_halos=True,
                                         clipped=True)
        assert len(logical_boxes) == len(physical_boxes)
        logical_and_physical_boxes = zip(logical_boxes, physical_boxes)
    else:
        # User provided list of physical boxes.
        # Clip them to the bounding box and calculate the logical boxes.
        if not hasattr(sparse_boxes, '__len__'):
            sparse_boxes = list(sparse_boxes)
        physical_boxes = np.asarray(sparse_boxes)
        assert physical_boxes.ndim == 3 and physical_boxes.shape[1:3] == (2, 3)

        def logical_and_clipped(box):
            midpoint = (box[0] + box[1]) // 2
            logical_box = grid.compute_logical_box(midpoint)
            box += (-grid.halo_shape, grid.halo_shape)
            # Note: Non-intersecting boxes will have non-positive shape after clipping
            clipped_box = box_intersection(box, bounding_box)
            return (logical_box, clipped_box)

        logical_and_physical_boxes = map(logical_and_clipped, physical_boxes)

        # Drop any boxes that fall completely outside the bounding box
        # Check that physical box doesn't completely fall outside its logical_box
        def is_valid(logical_and_physical):
            logical_box, physical_box = logical_and_physical
            return (physical_box[1] > logical_box[0]).all() and (
                physical_box[0] < logical_box[1]).all()

        logical_and_physical_boxes = filter(is_valid,
                                            logical_and_physical_boxes)

    if not hasattr(logical_and_physical_boxes, '__len__'):
        logical_and_physical_boxes = list(
            logical_and_physical_boxes)  # need len()

    num_bricks = len(logical_and_physical_boxes)
    if num_bricks == 0:
        return dask.bag.from_sequence([]), num_bricks

    if partition_size is None:
        partition_size = 1

    partition_size = max(1, partition_size)

    # If we're working with a tiny volume (e.g. testing),
    # make sure we at least parallelize across all cores.
    total_cores = sum(client.ncores().values())
    if (num_bricks // partition_size) < total_cores:
        partition_size = num_bricks // total_cores

    partition_size = max(1, partition_size)

    def brick_size(log_phys):
        _logical, physical = log_phys
        return np.uint64(np.prod(physical[1] - physical[0]))

    num_partitions = int(
        np.ceil(len(logical_and_physical_boxes) / partition_size))

    # Avoid powers-of-two partition sizes, since they hash poorly.
    if num_partitions != 1 and 2**np.log2(num_partitions) == num_partitions:
        #logger.info("Changing num_partitions to avoid power of two")
        if num_partitions < len(logical_and_physical_boxes):
            num_partitions += 1
        else:
            num_partitions -= 1

    # Distribute data across the cluster NOW, to force even distribution.
    boxes_bag = dask.bag.from_sequence(logical_and_physical_boxes,
                                       npartitions=num_partitions)
    if boxes_bag.npartitions != num_partitions:
        boxes_bag = boxes_bag.repartition(num_partitions)

    # Trying different things (scatter, non-scatter...) to work around this issue (I think).
    # https://github.com/dask/distributed/issues/3703#issuecomment-619446739
    with Timer() as scatter_timer:
        boxes_bag = client.scatter(boxes_bag).result()
    time.sleep(2.0)

    boxes_bag = boxes_bag.persist()
    boxes_bag.compute()

    total_volume = sum(map(brick_size, logical_and_physical_boxes))
    logger.info(
        f"Initializing bag of {num_bricks} Bricks "
        f"(over {boxes_bag.npartitions} partitions) with total volume {total_volume/1e9:.1f} Gvox "
    )
    #f"(scatter took {scatter_timer.timedelta})")

    if not isinstance(
            client, DebugClient) and os.environ.get("DEBUG_FLOW", "0") != "0":

        def worker_address(part):
            from distributed import get_worker
            return [(get_worker().address, len(part))]

        workers_and_lens = boxes_bag.map_partitions(worker_address).compute()
        logger.info("Workers and assigned partition lengths:")
        for worker, length in sorted(workers_and_lens):
            logger.info(f"{worker}: {length}")

    def make_brick(logical_and_physical_box):
        logical_box, physical_box = logical_and_physical_box

        # See comment in Brick.__init__
        logical_box.flags['WRITEABLE'] = False
        physical_box.flags['WRITEABLE'] = False

        location_id = tuple(logical_box[0] // grid.block_shape)
        if lazy:
            return Brick(logical_box,
                         physical_box,
                         location_id=location_id,
                         lazy_creation_fn=volume_accessor_func,
                         compression=compression)
        else:
            volume = volume_accessor_func(physical_box)
            return Brick(logical_box,
                         physical_box,
                         volume,
                         location_id=location_id,
                         compression=compression)

    def make_partition_bricks(part):
        newpart = []
        for item in part:
            newpart.append(make_brick(item))
        return newpart

    bricks = boxes_bag.map_partitions(make_partition_bricks)
    return bricks, num_bricks
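A minimal sketch of the dask.bag partitioning pattern above: build a bag of boxes with an explicit partition count, then convert each whole partition at once via map_partitions (here a stand-in function returns location ids instead of Bricks). Pure dask, no cluster required:

import dask.bag
import numpy as np

boxes = [np.array([[z, 0, 0], [z + 64, 64, 64]]) for z in range(0, 640, 64)]
num_partitions = 3  # the real code tunes this to core count and avoids powers of two

boxes_bag = dask.bag.from_sequence(boxes, npartitions=num_partitions)

def make_partition_items(part):
    # Called once per partition, so per-partition setup costs are paid once.
    return [tuple(int(x) for x in box[0] // 64) for box in part]

print(boxes_bag.map_partitions(make_partition_items).compute())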