def parallelize_bounding_box(self,
                             instance_name,
                             bounding_box_zyx,
                             grid,
                             target_partition_size_voxels):
    """
    Create an RDD for the given data instance (of either grayscale,
    labelblk, labelarray, or labelmap), within the given bounding box
    (start_zyx, stop_zyx), split into blocks of the given grid shape.
    The RDD partition lengths will be chosen so that each partition
    contains approximately target_partition_size_voxels in total.
    """
    block_size_voxels = np.prod(grid.block_shape)
    rdd_partition_length = target_partition_size_voxels // block_size_voxels

    bricks = generate_bricks_from_volume_source(bounding_box_zyx,
                                                grid,
                                                self.get_volume_accessor(instance_name),
                                                self.sc,
                                                rdd_partition_length)

    # If we're working with a tiny volume (e.g. testing),
    # make sure we at least parallelize across all cores.
    if bricks.getNumPartitions() < cpus_per_worker() * num_worker_nodes():
        bricks = bricks.repartition(cpus_per_worker() * num_worker_nodes())

    return bricks
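# Usage sketch (illustrative; 'ctx' and the instance name are hypothetical):
# assuming this method lives on an object with a live SparkContext (self.sc)
# and a registered 'grayscale' instance, a caller might split a 512^3
# sub-volume into roughly 256-Mvoxel partitions like so:
#
#     import numpy as np
#     bounding_box_zyx = np.array([[0, 0, 0], [512, 512, 512]])
#     bricks = ctx.parallelize_bounding_box('grayscale',
#                                           bounding_box_zyx,
#                                           Grid((64, 64, 512)),
#                                           target_partition_size_voxels=256 * 10**6)
#     print(bricks.getNumPartitions())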
def _sanitize_config(self):
    """
    Tidy up some config values, and fill in 'auto' values where needed.
    """
    input_config = self.config_data["input"]
    output_config = self.config_data["output"]
    options = self.config_data["options"]

    # Initialize a dummy input service, just to overwrite 'auto' config values as needed.
    VolumeService.create_from_config(input_config, self.config_dir)

    # The output bounding box must match the input's exactly (or be left as 'auto').
    input_bb_zyx = self.input_service.bounding_box_zyx
    output_bb_zyx = self.output_service.bounding_box_zyx
    assert ((output_bb_zyx == input_bb_zyx) | (output_bb_zyx == -1)).all(), \
        "Output bounding box must match the input bounding box exactly. (No translation permitted.)"

    assert output_config["slice-files"]["slice-xy-offset"] == [0, 0], \
        "Nonzero xy offset is meaningless for outputs."

    if options["slices-per-slab"] == -1:
        # Auto-choose a slab depth that keeps all threads busy with at least one slice
        brick_shape_zyx = self.input_service.preferred_message_shape
        brick_depth = brick_shape_zyx[0]
        assert brick_depth != -1
        num_threads = num_worker_nodes() * cpus_per_worker()
        threads_per_brick_layer = (num_threads + brick_depth - 1) // brick_depth  # round up
        options["slices-per-slab"] = brick_depth * threads_per_brick_layer
def _execute_labelindices(self, mapping_df):
    config = self.config_data
    options = config["options"]
    resource_manager_client = ResourceManagerClient(options["resource-server"],
                                                    options["resource-port"])

    last_mutid = options["mutation-id"]
    server = config["dvid"]["server"]
    uuid = config["dvid"]["uuid"]
    instance_name = config["dvid"]["segmentation-name"]
    endpoint = f'{server}/api/node/{uuid}/{instance_name}/indices'

    processor = StatsBatchProcessor(last_mutid, endpoint)

    # Load the h5 file
    block_sv_stats = load_stats_h5_to_records(config["block-stats-file"])

    # Note: Initializing this generator involves sorting the (very large) stats array
    batch_rows = options["batch-row-count"]
    batch_generator = generate_stats_batches(block_sv_stats, mapping_df, batch_rows)

    batches = self.sc.parallelize(batch_generator,
                                  cpus_per_worker() * num_worker_nodes())
    rt.persist_and_execute(batches, "Distributing batches", logger)

    def process_batch(item):
        stats_batch, total_rows = item
        approximate_bytes = 30 * total_rows  # this is highly unscientific
        with resource_manager_client.access_context(server, False, 1, approximate_bytes):
            processor.process_batch((stats_batch, total_rows))

    with Timer("Processing/sending batches", logger):
        batches.foreach(process_batch)
def compute_stats(block_shape, concurrent_threads, df):
    total_blocks = df['voxel_count'].sum() / np.prod(block_shape)
    stats = {
        "total-analyzed-requests": len(df),
        "num-workers": num_worker_nodes(),
        "available-threads": 16 * num_worker_nodes(),  # assumes 16 CPUs per worker node
        "concurrent-threads": concurrent_threads,
        "approx-requests-per-thread": len(df) / concurrent_threads,
        "blocks-per-request": total_blocks / len(df),
        "seconds-per-request": df['seconds'].mean(),
        "seconds-per-block": df['seconds'].sum() / total_blocks,
        "wall-time": (df['timestamp'].iloc[-1] - df['timestamp'].iloc[0]).seconds
                     + df['seconds'].iloc[-1],
    }
    return stats
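# Example (illustrative, not from the original source): compute_stats() only
# needs a DataFrame with 'voxel_count', 'seconds', and 'timestamp' columns,
# so it can be exercised with synthetic data.  Assumes num_worker_nodes() is
# importable alongside this function.
import numpy as np
import pandas as pd

example_df = pd.DataFrame({
    'voxel_count': [64**3, 2 * 64**3],   # voxels fetched per request
    'seconds':     [0.5, 0.9],           # per-request fetch time
    'timestamp':   pd.to_datetime(['2018-01-01 00:00:00',
                                   '2018-01-01 00:00:10']),
})
print(compute_stats(block_shape=(64, 64, 64), concurrent_threads=4, df=example_df))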
def group_by_body(self, segments_and_meshes):
    config = self.config_data

    # Group according to scheme
    grouping_scheme = config["mesh-config"]["storage"]["grouping-scheme"]
    n_partitions = num_worker_nodes() * cpus_per_worker()

    if grouping_scheme == "hundreds":
        def group_to_hundreds(id_mesh):
            body_id, _mesh = id_mesh
            group_id = body_id - (body_id % 100)  # round down to the nearest hundred
            return group_id
        grouped_body_ids_and_meshes = segments_and_meshes.groupBy(group_to_hundreds,
                                                                  numPartitions=n_partitions)

    elif grouping_scheme == "labelmap":
        import pandas as pd
        mapping_pairs = self.load_labelmap()

        def prepend_mapped_group_id(id_mesh_partition):
            df = pd.DataFrame(mapping_pairs, columns=["body_id", "group_id"])

            new_partition = []
            for id_mesh in id_mesh_partition:
                body_id, mesh = id_mesh
                rows = df.loc[df.body_id == body_id]
                if len(rows) == 0:
                    # If missing from labelmap,
                    # we assume an implicit identity mapping
                    group_id = body_id
                else:
                    group_id = rows['group_id'].iloc[0]
                new_partition.append((group_id, (body_id, mesh)))
            return new_partition

        # We do this via mapPartitions().groupByKey() instead of a simple groupBy()
        # to save time constructing the DataFrame inside the closure above.
        # (TODO: Figure out why the dataframe isn't pickling properly...)
        skip_groups = set(config["mesh-config"]["storage"]["skip-groups"])
        grouped_body_ids_and_meshes = segments_and_meshes.mapPartitions(prepend_mapped_group_id) \
                                                         .filter(lambda item: item[0] not in skip_groups) \
                                                         .groupByKey(numPartitions=n_partitions)

    elif grouping_scheme in ("singletons", "no-groups"):
        # Create 'groups' of one item each, re-using the body ID as the group id.
        # (The difference between 'singletons' and 'no-groups' is in how the mesh is stored, below.)
        grouped_body_ids_and_meshes = segments_and_meshes.map(
            lambda id_mesh: (id_mesh[0], [(id_mesh[0], id_mesh[1])]))

    persist_and_execute(grouped_body_ids_and_meshes,
                        f"Grouping meshes with scheme: '{grouping_scheme}'",
                        logger)
    return grouped_body_ids_and_meshes
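# Quick illustration of the 'hundreds' grouping arithmetic used above:
# every body ID in the same hundred-block maps to a single group ID.
for body_id in (1, 99, 100, 123, 1234):
    print(body_id, '->', body_id - (body_id % 100))
# 1 -> 0, 99 -> 0, 100 -> 100, 123 -> 100, 1234 -> 1200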
@classmethod
def from_accessor_func(cls, bounding_box, grid, volume_accessor_func=None, sc=None,
                       target_partition_size_voxels=None, sparse_boxes=None, lazy=False):
    """
    Convenience constructor, taking an arbitrary volume_accessor_func.

    Args:
        bounding_box:
            (start, stop)

        grid:
            Grid (see brick.py)

        volume_accessor_func:
            Callable with signature: f(box) -> ndarray
            Note: The callable will be unpickled only once per partition,
                  so initialization costs after unpickling are only
                  incurred once per partition.

        sc:
            SparkContext. If provided, an RDD is returned.
            Otherwise, an ordinary Python iterable is returned.

        target_partition_size_voxels:
            Optional. If provided, the RDD partition lengths (i.e. the number
            of bricks per RDD partition) will be chosen to include
            (approximately) this many total voxels in each partition.

        sparse_boxes:
            A list of (physical) sparse boxes indicating which bricks should
            actually be present in the BrickWall.
            If not provided, all bricks within the bounding_box will be present.

        lazy:
            If True, the bricks' data will not be created until their
            'volume' member is first accessed.
    """
    if target_partition_size_voxels is None:
        if sc:
            num_threads = num_worker_nodes() * cpus_per_worker()
        else:
            # See RDDtools -- for now, non-spark pseudo-RDDs are just a single partition.
            num_threads = 1

        if sparse_boxes is None:
            total_voxels = np.prod(bounding_box[1] - bounding_box[0])
        else:
            if not hasattr(sparse_boxes, '__len__'):
                sparse_boxes = list(sparse_boxes)
            total_voxels = sum(map(lambda physbox: np.prod(physbox[1] - physbox[0]),
                                   sparse_boxes))

        voxels_per_thread = total_voxels / num_threads
        target_partition_size_voxels = voxels_per_thread // 2  # Arbitrarily aim for 2 partitions per thread

    block_size_voxels = np.prod(grid.block_shape)
    rdd_partition_length = target_partition_size_voxels // block_size_voxels

    bricks = generate_bricks_from_volume_source(bounding_box, grid, volume_accessor_func,
                                                sc, rdd_partition_length, sparse_boxes, lazy)
    return BrickWall(bounding_box, grid, bricks)
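# Usage sketch (illustrative): build a small non-Spark BrickWall from an
# in-memory volume.  With sc=None, the bricks are an ordinary Python iterable
# rather than an RDD.  Assumes BrickWall and Grid are importable from this package.
import numpy as np

volume = np.random.randint(0, 255, size=(128, 128, 128), dtype=np.uint8)

def accessor(box):
    # Return the requested sub-box of the in-memory volume.
    (z0, y0, x0), (z1, y1, x1) = box
    return volume[z0:z1, y0:y1, x0:x1]

wall = BrickWall.from_accessor_func(bounding_box=np.array([[0, 0, 0], [128, 128, 128]]),
                                    grid=Grid((64, 64, 64)),
                                    volume_accessor_func=accessor,
                                    sc=None)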
def _process_slab(self, scale, slab_fullres_box_zyx, slab_index, num_slabs, upscale_slab_wall):
    num_threads = num_worker_nodes() * cpus_per_worker()
    slab_voxels = np.prod(slab_fullres_box_zyx[1] - slab_fullres_box_zyx[0]) // (2**scale)**3
    voxels_per_thread = slab_voxels // num_threads

    options = self.config_data["options"]
    pyramid_source = options["pyramid-source"]

    if pyramid_source == "copy" or scale == 0:
        # Copy from input source
        bricked_slab_wall = BrickWall.from_volume_service(self.input_service, scale, slab_fullres_box_zyx,
                                                          self.sc, voxels_per_thread // 2)
        bricked_slab_wall.persist_and_execute(f"Slab {slab_index}: Downloading scale {scale}", logger)
    else:
        # Downsample from previous scale
        bricked_slab_wall = upscale_slab_wall.downsample((2, 2, 2), 'grayscale')
        bricked_slab_wall.persist_and_execute(f"Slab {slab_index}: Downsampling to scale {scale}", logger)
        upscale_slab_wall.unpersist()
        del upscale_slab_wall

    if scale == 0:
        bricked_slab_wall = self.adjust_contrast(bricked_slab_wall, slab_index)

    # Remap to output bricks
    output_grid = Grid(self.output_service.preferred_message_shape)
    output_slab_wall = bricked_slab_wall.realign_to_new_grid(output_grid)

    # Pad from previously-existing pyramid data until
    # we have full storage blocks, e.g. (64,64,64),
    # but not necessarily full bricks, e.g. (64,64,6400)
    output_accessor_func = partial(self.output_service.get_subvolume, scale=scale)

    # But don't bother fetching real data for scale 0:
    # the input slabs are already block-aligned, and the edges of each slice will be zeros anyway.
    if scale == 0:
        output_accessor_func = lambda _box: 0

    padding_grid = Grid(3 * (self.output_service.block_width,), output_grid.offset)
    padded_slab_wall = output_slab_wall.fill_missing(output_accessor_func, padding_grid)
    padded_slab_wall.persist_and_execute(f"Slab {slab_index}: Assembling scale {scale} bricks", logger)

    # Discard original bricks
    bricked_slab_wall.unpersist()
    del bricked_slab_wall

    logger.info(f"Slab {slab_index}: Writing scale {scale}",
                extra={"status": f"Writing {slab_index}/{num_slabs}"})
    rt.foreach(partial(write_brick, self.output_service, scale), padded_slab_wall.bricks)

    return padded_slab_wall
def run_on_each_worker(self, func):
    """
    Run the given function once per worker node.
    """
    status_filepath = '/tmp/' + self._execution_uuid + '-' + str(self._worker_task_id)
    self._worker_task_id += 1

    @self.collect_log(lambda i: socket.gethostname() + '[' + func.__name__ + ']')
    def task_f(i):
        with FileLock(status_filepath):
            if os.path.exists(status_filepath):
                return None

            # Create an empty file to indicate that the task was executed
            open(status_filepath, 'w').close()

        result = func()
        return (socket.gethostname(), result)

    num_workers = num_worker_nodes()

    # It would be nice if we only had to schedule N tasks for N workers,
    # but we couldn't ensure that tasks are hashed 1-to-1 onto workers.
    # Instead, we'll schedule **LOTS** of extra tasks, but the logic in
    # task_f() will skip the unnecessary work.
    num_tasks = num_workers * 1000

    # Execute the tasks. Returns [(hostname, result), None, None, (hostname, result), ...],
    # with 'None' interspersed for hosts that were hit multiple times.
    # (Each host only returns a single non-None result.)
    host_results = self.sc.parallelize(list(range(num_tasks)), num_tasks) \
                          .repartition(num_tasks).map(task_f).collect()
    host_results = [_f for _f in host_results if _f]  # Drop Nones
    host_results = dict(host_results)

    assert len(host_results) == num_workers, \
        "Task '{}' was not executed on all {} workers, or some tasks failed! Nodes processed: \n{}" \
        .format(func.__name__, num_workers, host_results)

    logger.info("Ran {} on {} nodes: {}".format(func.__name__, len(host_results), host_results))
    return host_results
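# Usage sketch (illustrative, hypothetical names and paths): run a setup
# function exactly once per worker node, e.g. to ensure a scratch directory
# exists everywhere.  Assumes this method lives on a workflow object with a
# live SparkContext (self.sc).
#
#     def make_scratch_dir():
#         os.makedirs('/tmp/my-scratch', exist_ok=True)  # hypothetical path
#         return '/tmp/my-scratch'
#
#     host_results = workflow.run_on_each_worker(make_scratch_dir)
#     # -> {'worker-01': '/tmp/my-scratch', 'worker-02': '/tmp/my-scratch', ...}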
def _execute_mesh_generation(self, large_id_box_mask_factor_err):
    config = self.config_data

    @self.collect_log(lambda _: '_MESH_GENERATION_ERRORS')
    def logged_generate_mesh(arg):
        return generate_mesh_in_subprocess(config, arg)

    # --> (body_id, mesh_bytes, error_msg)
    body_ids_and_meshes_with_err = large_id_box_mask_factor_err.map(logged_generate_mesh)
    persist_and_execute(body_ids_and_meshes_with_err, "Computing meshes", logger)

    # Errors were already written to a separate file, but let's duplicate them in the master log.
    errors = body_ids_and_meshes_with_err.map(lambda id_mesh_err: id_mesh_err[-1]) \
                                         .filter(bool).collect()
    for error in errors:
        logger.error(error)

    # Filter out error cases
    body_ids_and_meshes = body_ids_and_meshes_with_err.filter(lambda id_mesh_err: id_mesh_err[-1] is None) \
                                                      .map(lambda id_mesh_err: id_mesh_err[:2])

    # Group according to scheme
    grouping_scheme = config["mesh-config"]["storage"]["grouping-scheme"]
    n_partitions = num_worker_nodes() * cpus_per_worker()

    if grouping_scheme == "hundreds":
        def group_to_hundreds(id_mesh):
            body_id, _mesh = id_mesh
            group_id = body_id - (body_id % 100)  # round down to the nearest hundred
            return group_id
        grouped_body_ids_and_meshes = body_ids_and_meshes.groupBy(group_to_hundreds,
                                                                  numPartitions=n_partitions)

    elif grouping_scheme == "labelmap":
        import pandas as pd
        mapping_pairs = load_labelmap(config["mesh-config"]["storage"]["labelmap"], self.config_dir)

        def prepend_mapped_group_id(id_mesh_partition):
            df = pd.DataFrame(mapping_pairs, columns=["body_id", "group_id"])

            new_partition = []
            for id_mesh in id_mesh_partition:
                body_id, mesh = id_mesh
                rows = df.loc[df.body_id == body_id]
                if len(rows) == 0:
                    # If missing from labelmap,
                    # we assume an implicit identity mapping
                    group_id = body_id
                else:
                    group_id = rows['group_id'].iloc[0]
                new_partition.append((group_id, (body_id, mesh)))
            return new_partition

        # We do this via mapPartitions().groupByKey() instead of a simple groupBy()
        # to save time constructing the DataFrame inside the closure above.
        # (TODO: Figure out why the dataframe isn't pickling properly...)
        skip_groups = set(config["mesh-config"]["storage"]["skip-groups"])
        grouped_body_ids_and_meshes = body_ids_and_meshes.mapPartitions(prepend_mapped_group_id) \
                                                         .filter(lambda item: item[0] not in skip_groups) \
                                                         .groupByKey(numPartitions=n_partitions)

    elif grouping_scheme in ("singletons", "no-groups"):
        # Create 'groups' of one item each, re-using the body ID as the group id.
        # (The difference between 'singletons' and 'no-groups' is in how the mesh is stored, below.)
        grouped_body_ids_and_meshes = body_ids_and_meshes.map(
            lambda id_mesh: (id_mesh[0], [(id_mesh[0], id_mesh[1])]))

    persist_and_execute(grouped_body_ids_and_meshes,
                        f"Grouping meshes with scheme: '{grouping_scheme}'",
                        logger)
    unpersist(body_ids_and_meshes)
    del body_ids_and_meshes

    with Timer() as timer:
        grouped_body_ids_and_meshes.foreachPartition(partial(post_meshes_to_dvid, config))
    logger.info(f"Writing meshes to DVID took {timer.seconds} seconds")
def execute(self):
    self._init_services()
    self._sanitize_config()

    options = self.config_data["options"]
    output_service = self.output_service
    logger.info(f"Output bounding box: {output_service.bounding_box_zyx[:,::-1]}")

    # Data is processed in Z-slabs
    slab_depth = options["slices-per-slab"]

    input_bb_zyx = self.input_service.bounding_box_zyx
    _, slice_start_y, slice_start_x = input_bb_zyx[0]

    slab_shape_zyx = input_bb_zyx[1] - input_bb_zyx[0]
    slab_shape_zyx[0] = slab_depth

    slice_shape_zyx = slab_shape_zyx.copy()
    slice_shape_zyx[0] = 1

    # This grid outlines the slabs -- each grid box is a full slab
    slab_grid = Grid(slab_shape_zyx, (0, slice_start_y, slice_start_x))
    slab_boxes = list(clipped_boxes_from_grid(input_bb_zyx, slab_grid))

    for slab_index, slab_box_zyx in enumerate(slab_boxes):
        # Construct a BrickWall from the input bricks
        num_threads = num_worker_nodes() * cpus_per_worker()
        slab_voxels = np.prod(slab_box_zyx[1] - slab_box_zyx[0])
        voxels_per_thread = slab_voxels / num_threads

        bricked_slab_wall = BrickWall.from_volume_service(self.input_service, 0, slab_box_zyx,
                                                          self.sc, voxels_per_thread / 2)

        # Force download
        bricked_slab_wall.persist_and_execute(
            f"Downloading slab {slab_index}/{len(slab_boxes)}: {slab_box_zyx[:,::-1]}", logger)

        # Remap to slice-sized "bricks"
        sliced_grid = Grid(slice_shape_zyx, offset=slab_box_zyx[0])
        sliced_slab_wall = bricked_slab_wall.realign_to_new_grid(sliced_grid)
        sliced_slab_wall.persist_and_execute(
            f"Assembling slab {slab_index}/{len(slab_boxes)} slices", logger)

        # Discard original bricks
        bricked_slab_wall.unpersist()
        del bricked_slab_wall

        def write_slice(brick):
            assert (brick.physical_box == brick.logical_box).all()
            output_service.write_subvolume(brick.volume, brick.physical_box[0])

        # Export to PNG or TIFF, etc. (automatic via slice path extension)
        with Timer() as timer:
            logger.info(f"Exporting slab {slab_index}/{len(slab_boxes)}",
                        extra={"status": f"Exporting {slab_index}/{len(slab_boxes)}"})
            rt.foreach(write_slice, sliced_slab_wall.bricks)
        logger.info(f"Exporting slab {slab_index}/{len(slab_boxes)} took {timer.timedelta}",
                    extra={"status": f"Done: {slab_index}/{len(slab_boxes)}"})

        # Discard slice data
        sliced_slab_wall.unpersist()
        del sliced_slab_wall

    logger.info(f"DONE exporting {len(slab_boxes)} slabs.", extra={'status': "DONE"})
def generate_bricks_from_volume_source(bounding_box, grid, volume_accessor_func, sc=None,
                                       rdd_partition_length=None, sparse_boxes=None, lazy=False):
    """
    Generate an RDD or iterable of Bricks for the given bounding box and grid.

    Args:
        bounding_box:
            (start, stop)

        grid:
            Grid (see above)

        volume_accessor_func:
            Callable with signature: f(box) -> ndarray
            Note: The callable will be unpickled only once per partition,
                  so initialization costs after unpickling are only
                  incurred once per partition.

        sc:
            SparkContext. If provided, an RDD is returned.
            Otherwise, an ordinary Python iterable is returned.

        rdd_partition_length:
            Optional. If provided, the RDD will have (approximately)
            this many bricks per partition.

        sparse_boxes:
            Optional. A pre-calculated list of boxes to use instead of
            calculating the complete (dense) list of grid boxes within the
            bounding box. If provided, it should be a list of physical boxes,
            and no two should occupy the same logical box, as defined by
            their midpoints.
            Note: They will still be clipped to the overall bounding_box.

    Note: The halo (if any) is taken from grid.halo_shape and added to each
          Brick's physical_box.
    """
    if sparse_boxes is None:
        # Generate boxes from a densely populated grid
        logical_boxes = boxes_from_grid(bounding_box, grid, include_halos=False)
        physical_boxes = clipped_boxes_from_grid(bounding_box, grid)
        logical_and_physical_boxes = zip(logical_boxes, physical_boxes)
    else:
        # User provided a list of physical boxes.
        # Clip them to the bounding box and calculate the logical boxes.
        if not hasattr(sparse_boxes, '__len__'):
            sparse_boxes = list(sparse_boxes)
        physical_boxes = np.asarray(sparse_boxes)
        assert physical_boxes.ndim == 3 and physical_boxes.shape[1:3] == (2, 3)

        def logical_and_clipped(box):
            midpoint = (box[0] + box[1]) // 2
            logical_box = grid.compute_logical_box(midpoint)
            box += (-grid.halo_shape, grid.halo_shape)
            # Note: Non-intersecting boxes will have non-positive shape after clipping
            clipped_box = box_intersection(box, bounding_box)
            return (logical_box, clipped_box)

        logical_and_physical_boxes = map(logical_and_clipped, physical_boxes)

        # Drop any boxes that fall completely outside the bounding box,
        # i.e. check that each physical box doesn't fall completely outside its logical_box.
        def is_valid(logical_and_physical):
            logical_box, physical_box = logical_and_physical
            return (physical_box[1] > logical_box[0]).all() and (physical_box[0] < logical_box[1]).all()

        logical_and_physical_boxes = filter(is_valid, logical_and_physical_boxes)

    if sc:
        if not hasattr(logical_and_physical_boxes, '__len__'):
            logical_and_physical_boxes = list(logical_and_physical_boxes)  # need len()

        num_rdd_partitions = None
        if rdd_partition_length is not None:
            rdd_partition_length = max(1, rdd_partition_length)
            num_rdd_partitions = int(np.ceil(len(logical_and_physical_boxes) / rdd_partition_length))

        # If we're working with a tiny volume (e.g. testing),
        # make sure we at least parallelize across all cores.
        if num_rdd_partitions is not None and (num_rdd_partitions < cpus_per_worker() * num_worker_nodes()):
            num_rdd_partitions = cpus_per_worker() * num_worker_nodes()

        def brick_size(log_phys):
            _logical, physical = log_phys
            return np.uint64(np.prod(physical[1] - physical[0]))
        total_volume = sum(map(brick_size, logical_and_physical_boxes))
        logger.info(f"Initializing RDD of {len(logical_and_physical_boxes)} Bricks "
                    f"(over {num_rdd_partitions} partitions) with total volume {total_volume/1e9:.1f} Gvox")

        # Enumerate and repartition to get perfect partition sizes,
        # rather than relying on Spark's default hash.
        class _enumerated_value(tuple):
            # Return a hash based on the key alone.
            def __hash__(self):
                return self[0]

        enumerated_logical_and_physical_boxes = sc.parallelize(enumerate(logical_and_physical_boxes),
                                                               num_rdd_partitions)
        enumerated_logical_and_physical_boxes = enumerated_logical_and_physical_boxes.map(_enumerated_value)
        enumerated_logical_and_physical_boxes = enumerated_logical_and_physical_boxes.partitionBy(
            num_rdd_partitions, lambda x: x)
        logical_and_physical_boxes = enumerated_logical_and_physical_boxes.values()

    def make_bricks(logical_and_physical_box):
        logical_box, physical_box = logical_and_physical_box
        if lazy:
            return Brick(logical_box, physical_box, lazy_creation_fn=volume_accessor_func)
        else:
            volume = volume_accessor_func(physical_box)
            return Brick(logical_box, physical_box, volume)

    bricks = rt.map(make_bricks, logical_and_physical_boxes)
    return bricks
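# Usage sketch (illustrative): generate bricks without Spark (sc=None), using a
# trivial accessor over an in-memory volume.  Assumes Grid and this function are
# importable from this module, and that a default Grid has no halo.
import numpy as np

small_volume = np.zeros((100, 200, 300), dtype=np.uint8)

def small_accessor(box):
    (z0, y0, x0), (z1, y1, x1) = box
    return small_volume[z0:z1, y0:y1, x0:x1]

brick_iter = generate_bricks_from_volume_source(np.array([[0, 0, 0], [100, 200, 300]]),
                                                Grid((64, 64, 64)),
                                                small_accessor)
for brick in brick_iter:
    print(brick.logical_box.tolist(), brick.physical_box.tolist())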
def execute(self):
    from pyspark import StorageLevel

    self._sanitize_config()
    config = self.config_data
    options = config["options"]

    resource_mgr_client = ResourceManagerClient(options["resource-server"],
                                                options["resource-port"])

    total_cpus = 16 * num_worker_nodes()  # assumes 16 CPUs per worker node
    concurrent_threads = total_cpus
    if options["resource-server"]:
        concurrent_threads = options["resource-server-config"]["read_reqs"]
        if concurrent_threads > total_cpus:
            msg = "You're attempting to use the resource manager to constrain concurrency, but you " \
                  "aren't running with a large enough cluster to saturate the resource manager settings."
            raise RuntimeError(msg)

    # We instantiate a VolumeService as an easy way to plug in missing config values as necessary.
    # (We won't actually use it.)
    volume_service = VolumeService.create_from_config(config["input"], self.config_dir)

    server = volume_service.server
    uuid = volume_service.uuid
    instance = volume_service.instance_name
    block_shape = 3 * (volume_service.block_width,)

    def timed_fetch_blocks_from_box(box):
        """
        Fetch the blocks for a given box and return the time it took to fetch them.
        Do not bother decompressing the blocks or combining them into a single volume.
        """
        assert not (box % block_shape).any(), "For this test, all requests must be block-aligned"
        block_boxes = list(boxes_from_grid(box, Grid(block_shape)))
        block_coords_xyz = np.array(block_boxes)[:, 0, ::-1] // block_shape
        block_coords_str = ','.join(map(str, block_coords_xyz.flat))
        voxel_count = np.prod(box[1] - box[0])

        session = default_dvid_session()
        url = f'{server}/api/node/{uuid}/{instance}/specificblocks?blocks={block_coords_str}'

        with resource_mgr_client.access_context(server, True, 1, voxel_count):
            timestamp = datetime.now()
            with Timer() as timer:
                r = session.get(url)

        r.raise_for_status()
        return timestamp, voxel_count, len(r.content), timer.seconds

    # This hash-related hackery is to ensure uniform partition lengths, which Spark is bad at by default.
    boxes = list(clipped_boxes_from_grid(volume_service.bounding_box_zyx,
                                         Grid(volume_service.preferred_message_shape)))
    indexed_boxes = list(map(rt.tuple_with_hash, enumerate(boxes)))
    for i_box in indexed_boxes:
        i_box.set_hash(i_box[0])

    rdd_boxes = self.sc.parallelize(indexed_boxes).values()
    timestamps_voxels_sizes_times = rdd_boxes.map(timed_fetch_blocks_from_box)

    # The only reason I'm persisting this is to see the partition distribution in the log.
    rt.persist_and_execute(timestamps_voxels_sizes_times, "Fetching blocks",
                           logger, StorageLevel.MEMORY_ONLY)  # @UndefinedVariable

    # Execute the workload
    timestamps, voxels, sizes, times = zip(*timestamps_voxels_sizes_times.collect())

    # Process the results
    self.dump_stats(timestamps, voxels, sizes, times, block_shape, concurrent_threads)
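# Illustration (standalone, not from the original source): how
# timed_fetch_blocks_from_box() converts a block-aligned ZYX box into the
# comma-separated XYZ block coordinates that the /specificblocks query string
# carries.  Pure numpy; block enumeration is done inline here instead of via
# boxes_from_grid().
import numpy as np

block_shape = (64, 64, 64)
box = np.array([[0, 64, 128], [64, 128, 256]])  # block-aligned ZYX box

starts_zyx = np.array([(z, y, x)
                       for z in range(box[0, 0], box[1, 0], 64)
                       for y in range(box[0, 1], box[1, 1], 64)
                       for x in range(box[0, 2], box[1, 2], 64)])
block_coords_xyz = starts_zyx[:, ::-1] // block_shape
print(','.join(map(str, block_coords_xyz.flat)))  # -> "2,1,0,3,1,0"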