def export_supervoxel_stats(h5_path, output_csv_path, delimiter=' '):
    block_sv_stats = load_stats_h5_to_records(h5_path, False)

    with Timer(f"Sorting {len(block_sv_stats)} block stats", logger):
        block_sv_stats.sort(order=['segment_id', 'z', 'y', 'x', 'count'])

    with Timer("Converting coordinates to block indexes", logger):
        _convert_coords_to_block_indexes(block_sv_stats)

    _export_csv(block_sv_stats, output_csv_path)
def persist_and_execute(rdd, description, logger=None, storage=None):
    """
    Persist and execute the given RDD or iterable.
    The persisted RDD is returned (in the case of an iterable, it may not be the original).
    """
    if logger:
        logger.info(f"{description}...")

    with Timer() as timer:
        if isinstance(rdd, _RDD):
            if storage is None:
                from pyspark import StorageLevel
                storage = StorageLevel.MEMORY_ONLY
            rdd.persist(storage)
            count = rdd.count()  # force eval
            parts = rdd.getNumPartitions()
            partition_counts = rdd.mapPartitions(lambda part: [sum(1 for _ in part)]).collect()
            histogram = defaultdict(lambda: 0)
            for c in partition_counts:
                histogram[c] += 1
            histogram = dict(histogram)
        else:
            rdd = list(rdd)  # force eval and 'persist' in a new list
            count = len(rdd)
            parts = 1
            histogram = {count: 1}

    if logger:
        logger.info(f"{description} (N={count}, P={parts}, P_hist={histogram}) took {timer.timedelta}")

    return rdd
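# A minimal usage sketch for persist_and_execute(), assuming a live SparkContext
# (`sc`) and this module's `logger`; the data below is hypothetical. The point is
# that RDDs are persisted and force-evaluated in place, while plain iterables are
# materialized into a new list.
def _demo_persist_and_execute(sc):
    from pyspark import StorageLevel
    rdd = sc.parallelize(range(1000), 8)
    rdd = persist_and_execute(rdd, "Materializing demo RDD", logger,
                              storage=StorageLevel.MEMORY_AND_DISK)

    # Non-RDD iterables are also accepted; the return value is then a new list.
    squares = persist_and_execute((x**2 for x in range(10)), "Squaring", logger)
    return rdd, squares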
def export_body_stats(h5_path, mapping_csv, output_csv_path, delimiter=' '):
    mapping_pairs = load_edge_csv(mapping_csv)
    segment_to_body_df = pd.DataFrame(mapping_pairs, columns=['segment_id', 'body_id'])

    block_sv_stats = load_stats_h5_to_records(h5_path, True)
    _overwrite_body_id_column(block_sv_stats, segment_to_body_df)

    with Timer(f"Sorting {len(block_sv_stats)} block stats", logger):
        block_sv_stats.sort(order=['body_id', 'segment_id', 'z', 'y', 'x', 'count'])

    with Timer("Converting coordinates to block indexes", logger):
        _convert_coords_to_block_indexes(block_sv_stats)

    _export_csv(block_sv_stats, output_csv_path)
def _execute_labelindices(self, mapping_df):
    config = self.config_data
    options = config["options"]
    resource_manager_client = ResourceManagerClient(options["resource-server"],
                                                    options["resource-port"])

    last_mutid = options["mutation-id"]
    server = config["dvid"]["server"]
    uuid = config["dvid"]["uuid"]
    instance_name = config["dvid"]["segmentation-name"]
    endpoint = f'{server}/api/node/{uuid}/{instance_name}/indices'

    processor = StatsBatchProcessor(last_mutid, endpoint)

    # Load the h5 file
    block_sv_stats = load_stats_h5_to_records(config["block-stats-file"])

    # Note: Initializing this generator involves sorting the (very large) stats array
    batch_rows = options["batch-row-count"]
    batch_generator = generate_stats_batches(block_sv_stats, mapping_df, batch_rows)

    batches = self.sc.parallelize(batch_generator,
                                  cpus_per_worker() * num_worker_nodes())
    rt.persist_and_execute(batches, "Distributing batches", logger)

    def process_batch(item):
        stats_batch, total_rows = item
        approximate_bytes = 30 * total_rows  # this is highly unscientific
        with resource_manager_client.access_context(server, False, 1, approximate_bytes):
            processor.process_batch((stats_batch, total_rows))

    with Timer("Processing/sending batches", logger):
        batches.foreach(process_batch)
def _export_csv(stats, output_csv_path):
    if os.path.exists(output_csv_path):
        os.unlink(output_csv_path)

    with Timer(f"Writing sorted stats to {output_csv_path}", logger):
        chunk_size = 10_000_000
        for row_start in tqdm(range(0, len(stats), chunk_size)):
            row_stop = min(row_start + chunk_size, len(stats))
            df = pd.DataFrame(stats[row_start:row_stop])
            df.to_csv(output_csv_path, sep=' ', header=False, index=False, mode='a')
def _execute_mappings(self, mapping_df):
    config = self.config_data
    if mapping_df is None:
        raise RuntimeError("Can't load mappings: No agglomeration mapping provided.")

    # Just do this from a single machine (the driver), with a big batch size.
    # The writes are serialized on the DVID side, anyway.
    with Timer("Sending mapping", logger):
        ingest_mapping(config["dvid"]["server"],
                       config["dvid"]["uuid"],
                       config["dvid"]["segmentation-name"],
                       config["options"]["mutation-id"],
                       mapping_df,
                       batch_size=100_000,
                       show_progress_bar=False,
                       session=default_dvid_session())
def skeletonize(config, body_id, combined_box, combined_mask, downsample_factor):
    (combined_box_start, _combined_box_stop) = combined_box

    # This config factor is an option to artificially scale the meshes up before
    # writing them, on top of whatever amount the data was downsampled.
    rescale_factor = config["options"]["rescale-before-write"]
    downsample_factor *= rescale_factor
    combined_box = combined_box * rescale_factor

    with Timer() as timer:
        # FIXME: Should the skeleton-config be tweaked in any way based on the downsample_factor??
        tree = skeletonize_array(combined_mask, config["skeleton-config"])
        tree.rescale(downsample_factor, downsample_factor, downsample_factor, True)
        tree.translate(*combined_box_start.astype(np.float64)[::-1])  # Pass x,y,z, not z,y,x

    del combined_mask

    swc_contents = "# {:%Y-%m-%d %H:%M:%S}\n".format(datetime.now())
    swc_contents += "# Generated by the DVIDSparkServices 'CreateSkeletons' workflow.\n"
    swc_contents += f"# (Skeletonization time: {timer.seconds}):\n"
    swc_contents += "# Workflow configuration:\n"
    swc_contents += "# \n"

    # Also show which downsample factor was actually chosen
    config_copy = copy.deepcopy(config)
    config_copy["options"]["(final-downsample-factor)"] = downsample_factor

    config_comment = json_dumps(config_copy, sort_keys=True, indent=4,
                                separators=(',', ': '))
    config_comment = "\n".join("# " + line for line in config_comment.split("\n"))
    config_comment += "\n\n"

    swc_contents += config_comment + tree.toString()

    del tree
    return (body_id, swc_contents)  # No error
def _execute_skeletonization(self, large_id_box_mask_factor_err):
    config = self.config_data

    @self.collect_log(lambda _: '_SKELETONIZATION_ERRORS')
    def logged_skeletonize(arg):
        return skeletonize_in_subprocess(config, arg)

    # --> (body_id, swc_contents, error_msg)
    body_ids_and_skeletons = large_id_box_mask_factor_err.map(logged_skeletonize)
    persist_and_execute(body_ids_and_skeletons, "Computing skeletons", logger)

    # Errors were already written to a separate file,
    # but let's duplicate them in the master log.
    errors = body_ids_and_skeletons.map(lambda id_swc_err: id_swc_err[-1]) \
                                   .filter(bool) \
                                   .collect()
    for error in errors:
        logger.error(error)

    # Write
    with Timer() as timer:
        body_ids_and_skeletons.foreachPartition(partial(post_swcs_to_dvid, config))
    logger.info(f"Writing skeletons to DVID took {timer.seconds}")
def timed_fetch_blocks_from_box(box):
    """
    Fetch the blocks for a given box and return the time it took to fetch them.
    Do not bother decompressing the blocks or combining them into a single volume.
    """
    assert not (box % block_shape).any(), \
        "For this test, all requests must be block-aligned"
    block_boxes = list(boxes_from_grid(box, Grid(block_shape)))
    block_coords_xyz = np.array(block_boxes)[:, 0, ::-1] // block_shape
    block_coords_str = ','.join(map(str, block_coords_xyz.flat))

    voxel_count = np.prod(box[1] - box[0])

    session = default_dvid_session()
    url = f'{server}/api/node/{uuid}/{instance}/specificblocks?blocks={block_coords_str}'

    with resource_mgr_client.access_context(server, True, 1, voxel_count):
        timestamp = datetime.now()
        with Timer() as timer:
            r = session.get(url)

    r.raise_for_status()
    return timestamp, voxel_count, len(r.content), timer.seconds
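# A hedged usage sketch: the box below is hypothetical, and the module globals
# used above (server, uuid, instance, block_shape, resource_mgr_client) are
# assumed to be configured already. It shows how the returned tuple can be
# turned into a simple throughput figure.
def _demo_fetch_throughput():
    box = np.array([[0, 0, 0], [64, 128, 128]])  # [start_zyx, stop_zyx], block-aligned
    timestamp, voxel_count, nbytes, seconds = timed_fetch_blocks_from_box(box)
    print(f"{timestamp}: fetched {voxel_count} voxels "
          f"({nbytes} compressed bytes) in {seconds:.3f}s "
          f"-> {nbytes / seconds / 1e6:.2f} MB/s")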
def writeimagepyramid(part_data):
    logger = logging.getLogger(__name__)
    part, vol = part_data
    offset = part.get_offset()
    zslice = offset.z

    from PIL import Image
    from scipy import ndimage
    import io

    s = default_dvid_session()

    # pad data with delimiter if needed
    timslice = vol[0, :, :]
    shiftx = offset.x % tilesize
    shifty = offset.y % tilesize
    tysize, txsize = timslice.shape
    ysize = tysize + shifty
    xsize = txsize + shiftx
    imslice = np.zeros((ysize, xsize))
    imslice[:, :] = delimiter
    imslice[shifty:ysize, shiftx:xsize] = timslice
    curry = (offset.y - shifty) // 2
    currx = (offset.x - shiftx) // 2

    imlevels = []
    tileoffsetyx = []
    imlevels.append(imslice)
    tileoffsetyx.append((offset.y // tilesize, offset.x // tilesize))

    with Timer() as downsample_timer:
        # use generic downsample algorithm
        for level in range(1, maxlevel + 1):
            tysize, txsize = imlevels[level - 1].shape

            shiftx = currx % tilesize
            shifty = curry % tilesize

            ysize = tysize + shifty
            xsize = txsize + shiftx
            imslice = np.zeros((ysize, xsize))
            imslice[:, :] = delimiter
            timslice = ndimage.zoom(imlevels[level - 1], 0.5)
            imslice[shifty:ysize, shiftx:xsize] = timslice
            imlevels.append(imslice)
            # Keep the (y, x) convention used for level 0 above,
            # since these are unpacked below as (starty, startx).
            tileoffsetyx.append((curry // tilesize, currx // tilesize))

            curry = (curry - shifty) // 2
            currx = (currx - shiftx) // 2

    logger.info("Downsampled {} levels in {:.3f} seconds".format(
        maxlevel, downsample_timer.seconds))

    # write tile pyramid using custom requests
    for levelnum in range(0, len(imlevels)):
        levelslice = imlevels[levelnum]
        dim1, dim2 = levelslice.shape

        num1tiles = (dim1 - 1) // tilesize + 1
        num2tiles = (dim2 - 1) // tilesize + 1

        with Timer() as post_timer:
            for iter1 in range(0, num1tiles):
                for iter2 in range(0, num2tiles):
                    # extract tile
                    tileholder = np.zeros((tilesize, tilesize), np.uint8)
                    tileholder[:, :] = delimiter
                    min1 = iter1 * tilesize
                    min2 = iter2 * tilesize
                    tileslice = levelslice[min1:min1 + tilesize, min2:min2 + tilesize]
                    t1, t2 = tileslice.shape
                    tileholder[0:t1, 0:t2] = tileslice

                    starty, startx = tileoffsetyx[levelnum]
                    starty += iter1
                    startx += iter2

                    if createtiles:
                        buf = BytesIO()
                        img = Image.frombuffer('L', (tilesize, tilesize),
                                               tileholder.tobytes(), 'raw', 'L', 0, 1)
                        img.save(buf, format="png")
                        urlreq = (server + "/api/node/" + uuid + "/" + tilename
                                  + "/tile/xy/" + str(levelnum) + "/"
                                  + str(startx) + "_" + str(starty) + "_" + str(zslice))
                        s.post(urlreq, data=buf.getvalue())
                        buf.close()

                    if createtilesjpeg:
                        buf = BytesIO()
                        img = Image.frombuffer('L', (tilesize, tilesize),
                                               tileholder.tobytes(), 'raw', 'L', 0, 1)
                        img.save(buf, format="jpeg")
                        urlreq = (server + "/api/node/" + uuid + "/" + tilenamejpeg
                                  + "/tile/xy/" + str(levelnum) + "/"
                                  + str(startx) + "_" + str(starty) + "_" + str(zslice))
                        s.post(urlreq, data=buf.getvalue())
                        buf.close()

        logger.info("Posted {} tiles (level={}) in {} seconds".format(
            num1tiles * num2tiles, levelnum, post_timer.seconds))
def write_blocks(part_vol):
    logger = logging.getLogger(__name__)
    part, data = part_vol
    offset = part.get_offset()
    reloffset = part.get_reloffset()
    _, _, x_size = data.shape
    if x_size % blksize != 0:
        # check if padded
        raise ValueError("Data is not block aligned")

    shiftedoffset = (offset.z + reloffset.z,
                     offset.y + reloffset.y,
                     offset.x + reloffset.x)
    logger.info("Starting WRITE of partition at: {} size: {}".format(
        shiftedoffset, data.shape))
    node_service = retrieve_node_service(server, uuid, resource_server,
                                         resource_port, appname)

    # Find all non-zero blocks (and record by block index)
    block_coords = []
    for block_index, block_x in enumerate(range(0, x_size, blksize)):
        if not (data[:, :, block_x:block_x + blksize] == delimiter).all():
            # (Don't care about Z,Y indexes, just X-index)
            block_coords.append((0, 0, block_index))

    # Find *runs* of non-zero blocks
    block_runs = runlength_encode(block_coords, True)  # returns [[Z,Y,X1,X2], [Z,Y,X1,X2], ...]

    # Convert stop indexes from inclusive to exclusive
    block_runs[:, -1] += 1

    # Discard Z,Y indexes and convert from indexes to pixels
    ranges = blksize * block_runs[:, 2:4]

    # iterate through contiguous blocks and write to DVID
    # TODO: write compressed data directly into DVID
    for (data_x_start, data_x_end) in ranges:
        with Timer() as copy_timer:
            datacrop = data[:, :, data_x_start:data_x_end].copy()
        logger.info("Copied {}:{} in {:.3f} seconds".format(
            data_x_start, data_x_end, copy_timer.seconds))

        data_offset_zyx = (shiftedoffset[0],
                           shiftedoffset[1],
                           shiftedoffset[2] + data_x_start)

        if dataname is not None:
            with Timer() as put_timer:
                if not israw:
                    logger.info("STARTING Put: labels block {}".format(data_offset_zyx))
                    if resource_server != "" or dvid_info["dvid-server"].startswith("http://127.0.0.1"):
                        node_service.put_labels3D(dataname, datacrop, data_offset_zyx,
                                                  compress=True, throttle=False)
                    else:
                        node_service.put_labels3D(dataname, datacrop, data_offset_zyx,
                                                  compress=True)
                else:
                    logger.info("STARTING Put: raw block {}".format(data_offset_zyx))
                    if resource_server != "" or dvid_info["dvid-server"].startswith("http://127.0.0.1"):
                        node_service.put_gray3D(dataname, datacrop, data_offset_zyx,
                                                compress=False, throttle=False)
                    else:
                        node_service.put_gray3D(dataname, datacrop, data_offset_zyx,
                                                compress=False)
            logger.info("Put block {} in {:.3f} seconds".format(
                data_offset_zyx, put_timer.seconds))

        if dataname_lossy is not None:
            logger.info("STARTING Put: lossy block {}".format(data_offset_zyx))
            with Timer() as put_lossy_timer:
                if resource_server != "" or dvid_info["dvid-server"].startswith("http://127.0.0.1"):
                    node_service.put_gray3D(dataname_lossy, datacrop, data_offset_zyx,
                                            compress=False, throttle=False)
                else:
                    node_service.put_gray3D(dataname_lossy, datacrop, data_offset_zyx,
                                            compress=False)
            logger.info("Put lossy block {} in {:.3f} seconds".format(
                data_offset_zyx, put_lossy_timer.seconds))
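# A small worked example of the run-length encoding step above, with
# hypothetical coordinates and assuming runlength_encode() behaves as the
# inline comments describe: non-empty blocks at X-indexes 1, 2, and 5 become
# two runs with inclusive stops, which write_blocks() then converts to
# exclusive stops and finally to pixel ranges.
#
#   block_coords = [(0, 0, 1), (0, 0, 2), (0, 0, 5)]
#   runlength_encode(block_coords, True)   # -> [[0, 0, 1, 2], [0, 0, 5, 5]]
#   # After block_runs[:, -1] += 1:           [[0, 0, 1, 3], [0, 0, 5, 6]]
#   # With blksize=64:  ranges = [[64, 192], [320, 384]]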
def execute(self):
    self._init_services()
    self._sanitize_config()

    options = self.config_data["options"]

    output_service = self.output_service
    logger.info(f"Output bounding box: {output_service.bounding_box_zyx[:,::-1]}")

    # Data is processed in Z-slabs
    slab_depth = options["slices-per-slab"]

    input_bb_zyx = self.input_service.bounding_box_zyx
    _, slice_start_y, slice_start_x = input_bb_zyx[0]

    slab_shape_zyx = input_bb_zyx[1] - input_bb_zyx[0]
    slab_shape_zyx[0] = slab_depth

    slice_shape_zyx = slab_shape_zyx.copy()
    slice_shape_zyx[0] = 1

    # This grid outlines the slabs -- each grid box is a full slab
    slab_grid = Grid(slab_shape_zyx, (0, slice_start_y, slice_start_x))
    slab_boxes = list(clipped_boxes_from_grid(input_bb_zyx, slab_grid))

    for slab_index, slab_box_zyx in enumerate(slab_boxes):
        # Construct BrickWall from input bricks
        num_threads = num_worker_nodes() * cpus_per_worker()
        slab_voxels = np.prod(slab_box_zyx[1] - slab_box_zyx[0])
        voxels_per_thread = slab_voxels / num_threads

        bricked_slab_wall = BrickWall.from_volume_service(
            self.input_service, 0, slab_box_zyx, self.sc, voxels_per_thread / 2)

        # Force download
        bricked_slab_wall.persist_and_execute(
            f"Downloading slab {slab_index}/{len(slab_boxes)}: {slab_box_zyx[:,::-1]}",
            logger)

        # Remap to slice-sized "bricks"
        sliced_grid = Grid(slice_shape_zyx, offset=slab_box_zyx[0])
        sliced_slab_wall = bricked_slab_wall.realign_to_new_grid(sliced_grid)
        sliced_slab_wall.persist_and_execute(
            f"Assembling slab {slab_index}/{len(slab_boxes)} slices", logger)

        # Discard original bricks
        bricked_slab_wall.unpersist()
        del bricked_slab_wall

        def write_slice(brick):
            assert (brick.physical_box == brick.logical_box).all()
            output_service.write_subvolume(brick.volume, brick.physical_box[0])

        # Export to PNG or TIFF, etc. (automatic via slice path extension)
        with Timer() as timer:
            logger.info(f"Exporting slab {slab_index}/{len(slab_boxes)}",
                        extra={"status": f"Exporting {slab_index}/{len(slab_boxes)}"})
            rt.foreach(write_slice, sliced_slab_wall.bricks)
        logger.info(
            f"Exporting slab {slab_index}/{len(slab_boxes)} took {timer.timedelta}",
            extra={"status": f"Done: {slab_index}/{len(slab_boxes)}"})

        # Discard slice data
        sliced_slab_wall.unpersist()
        del sliced_slab_wall

    logger.info(f"DONE exporting {len(slab_boxes)} slabs.",
                extra={'status': "DONE"})
def _execute_mesh_generation(self, large_id_box_mask_factor_err):
    config = self.config_data

    @self.collect_log(lambda _: '_MESH_GENERATION_ERRORS')
    def logged_generate_mesh(arg):
        return generate_mesh_in_subprocess(config, arg)

    # --> (body_id, mesh_bytes, error_msg)
    body_ids_and_meshes_with_err = large_id_box_mask_factor_err.map(logged_generate_mesh)
    persist_and_execute(body_ids_and_meshes_with_err, "Computing meshes", logger)

    # Errors were already written to a separate file,
    # but let's duplicate them in the master log.
    errors = body_ids_and_meshes_with_err.map(lambda id_mesh_err: id_mesh_err[-1]) \
                                         .filter(bool) \
                                         .collect()
    for error in errors:
        logger.error(error)

    # Filter out error cases
    body_ids_and_meshes = body_ids_and_meshes_with_err \
                            .filter(lambda id_mesh_err: id_mesh_err[-1] is None) \
                            .map(lambda id_mesh_err: id_mesh_err[:2])

    # Group according to scheme
    grouping_scheme = config["mesh-config"]["storage"]["grouping-scheme"]
    n_partitions = num_worker_nodes() * cpus_per_worker()

    if grouping_scheme == "hundreds":
        def group_of_hundreds(id_mesh):
            body_id, _mesh = id_mesh
            group_id = body_id - (body_id % 100)
            return group_id

        grouped_body_ids_and_meshes = body_ids_and_meshes.groupBy(
            group_of_hundreds, numPartitions=n_partitions)

    elif grouping_scheme == "labelmap":
        import pandas as pd
        mapping_pairs = load_labelmap(
            config["mesh-config"]["storage"]["labelmap"], self.config_dir)

        def prepend_mapped_group_id(id_mesh_partition):
            df = pd.DataFrame(mapping_pairs, columns=["body_id", "group_id"])

            new_partition = []
            for id_mesh in id_mesh_partition:
                body_id, mesh = id_mesh
                rows = df.loc[df.body_id == body_id]
                if len(rows) == 0:
                    # If missing from labelmap,
                    # we assume an implicit identity mapping
                    group_id = body_id
                else:
                    group_id = rows['group_id'].iloc[0]
                new_partition.append((group_id, (body_id, mesh)))
            return new_partition

        # We do this via mapPartitions().groupByKey() instead of a simple groupBy()
        # to save time constructing the DataFrame inside the closure above.
        # (TODO: Figure out why the dataframe isn't pickling properly...)
        skip_groups = set(config["mesh-config"]["storage"]["skip-groups"])
        grouped_body_ids_and_meshes = body_ids_and_meshes \
                                        .mapPartitions(prepend_mapped_group_id) \
                                        .filter(lambda item: item[0] not in skip_groups) \
                                        .groupByKey(numPartitions=n_partitions)

    elif grouping_scheme in ("singletons", "no-groups"):
        # Create 'groups' of one item each, re-using the body ID as the group id.
        # (The difference between 'singletons' and 'no-groups' is in how the mesh is stored, below.)
        grouped_body_ids_and_meshes = body_ids_and_meshes.map(
            lambda id_mesh: (id_mesh[0], [(id_mesh[0], id_mesh[1])]))

    persist_and_execute(grouped_body_ids_and_meshes,
                        f"Grouping meshes with scheme: '{grouping_scheme}'", logger)
    unpersist(body_ids_and_meshes)
    del body_ids_and_meshes

    with Timer() as timer:
        grouped_body_ids_and_meshes.foreachPartition(
            partial(post_meshes_to_dvid, config))
    logger.info(f"Writing meshes to DVID took {timer.seconds}")
def compute_comparison_mapping_table(old_edges, new_edges, sv_sizes=None):
    """
    Given two agglomerations, encoded via old_edges and new_edges
    (in which vertex IDs correspond to supervoxel IDs), compute the
    connected components for both graphs, and also the CC of their
    graph intersection.

    Returns the mapping from SV to body (CC) for all three graphs as a
    pd.DataFrame.  Each body ID is defined as the minimum SV ID in the
    body, so of course there will be no correspondence between body IDs
    in the different mappings.

    If sv_sizes is provided, the size of each supervoxel is appended as
    a column in the DataFrame.  Any supervoxel IDs missing from sv_sizes
    ("phantom" supervoxels) are presumed to be of size 0.

    Note:
        For simply comparing segmentations (regardless of internal merge
        topology), this function may not be what you want.  Consider two
        3-node graphs: A-B-C and B-C-A.  Those two graphs yield identical
        segmentations (i.e. a single component), but their graph
        intersection yields two components (A and B-C).  Consider using a
        different function, which returns a set-based result.  That is,
        compute the CC for the 'old' graph, the CC for the 'new' graph,
        and then simply enumerate the unique body pairs in the resulting
        table.

    Args:
        old_edges: ndarray, shape (N,2)
        new_edges: ndarray, shape (M,2)
        sv_sizes: (Optional) Must be a pd.Series as returned by
                  load_supervoxel_sizes(), i.e. sv is the index and
                  size is the value.

    Returns:
        pd.DataFrame, indexed by sv with columns:
        "old_body", "new_body", "intersection_component",
        and "voxel_count" (if sv_sizes was provided)
    """
    # We require C-order arrays, since we'll be fiddling with dtype views
    # that change the shape of the arrays.
    # https://mail.scipy.org/pipermail/numpy-svn/2015-December/007404.html
    old_edges = old_edges.astype(np.uint64, order='C', copy=False)
    new_edges = new_edges.astype(np.uint64, order='C', copy=False)

    # Edges must be pre-normalized
    assert (old_edges[:, 0] <= old_edges[:, 1]).all()
    assert (new_edges[:, 0] <= new_edges[:, 1]).all()

    with Timer("Removing duplicate edges", logger):
        # Pre-sorting should speed up drop_duplicates()
        old_edges.view([('u', np.uint64), ('v', np.uint64)]).sort()
        new_edges.view([('u', np.uint64), ('v', np.uint64)]).sort()

        old_edges = pd.DataFrame(old_edges, copy=False).drop_duplicates().values
        new_edges = pd.DataFrame(new_edges, copy=False).drop_duplicates().values

    with Timer("Computing intersection", logger):
        all_edges = np.concatenate((old_edges, new_edges))
        all_edges.view([('u', np.uint64), ('v', np.uint64)]).sort()
        duplicate_markers = pd.DataFrame(all_edges, copy=False).duplicated().values
        common_edges = all_edges[duplicate_markers]
        del all_edges

    with Timer("Ensuring identical SV sets", logger):
        old_svs = set(pd.unique(old_edges.flat))
        new_svs = set(pd.unique(new_edges.flat))
        common_svs = set(pd.unique(common_edges.flat))

        # Append identity rows for SVs missing from either graph
        missing_from_old = np.fromiter(new_svs.union(common_svs) - old_svs, dtype=np.uint64)
        missing_from_new = np.fromiter(old_svs.union(common_svs) - new_svs, dtype=np.uint64)
        missing_from_common = np.fromiter(new_svs.union(old_svs) - common_svs, dtype=np.uint64)

        if len(missing_from_old) > 0:
            old_missing_edges = np.concatenate(
                (missing_from_old[:, None], missing_from_old[:, None]), axis=1)
            old_edges = np.concatenate((old_edges, old_missing_edges))

        if len(missing_from_new) > 0:
            new_missing_edges = np.concatenate(
                (missing_from_new[:, None], missing_from_new[:, None]), axis=1)
            new_edges = np.concatenate((new_edges, new_missing_edges))

        if len(missing_from_common) > 0:
            common_missing_edges = np.concatenate(
                (missing_from_common[:, None], missing_from_common[:, None]), axis=1)
            common_edges = np.concatenate((common_edges, common_missing_edges))

    with Timer("Computing old mapping", logger):
        old_mapping = mapping_from_edges(old_edges, sort_by='segment', as_series=True)

    with Timer("Computing new mapping", logger):
        new_mapping = mapping_from_edges(new_edges, sort_by='segment', as_series=True)

    with Timer("Computing intersection mapping", logger):
        intersection_mapping = mapping_from_edges(common_edges, sort_by='segment', as_series=True)

    assert len(old_mapping.index) == len(new_mapping.index) == len(intersection_mapping.index)

    sv_table = pd.DataFrame({"old_body": old_mapping,
                             "new_body": new_mapping,
                             "intersection_component": intersection_mapping},
                            copy=False)

    sv_table.index = sv_table.index.astype(np.uint64, copy=False)
    sv_table.index.name = "sv"

    if sv_sizes is not None:
        with Timer("Appending supervoxel sizes", logger):
            sv_table = sv_table.merge(pd.DataFrame(sv_sizes), 'left',
                                      left_index=True, right_index=True, copy=False)

        # Fix 'phantom' supervoxels (mentioned in the merge graph(s),
        # but not present in the volume)
        sv_table['voxel_count'].fillna(0, inplace=True)
        sv_table['voxel_count'] = sv_table['voxel_count'].astype(np.uint64)

    # Force correct dtypes
    sv_table['old_body'] = sv_table['old_body'].astype(np.uint64)
    sv_table['new_body'] = sv_table['new_body'].astype(np.uint64)
    sv_table['intersection_component'] = sv_table['intersection_component'].astype(np.uint64)
    return sv_table
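# A hedged usage sketch for compute_comparison_mapping_table(), with tiny
# hypothetical edge lists. Edges must be pre-normalized (u <= v), per the
# asserts above. Here the 'old' graph merges SVs (1,2) and the 'new' graph
# merges (2,3); the sv_sizes Series follows the documented convention
# (sv index, size value) and is named 'voxel_count' to match the merged column.
def _demo_comparison_table():
    old_edges = np.array([[1, 2]], np.uint64)
    new_edges = np.array([[2, 3]], np.uint64)
    sv_sizes = pd.Series([100, 200, 300],
                         index=pd.Index([1, 2, 3], name='sv'),
                         name='voxel_count')
    sv_table = compute_comparison_mapping_table(old_edges, new_edges, sv_sizes)
    # Expect one row per SV (1, 2, 3): old bodies {1,2}, new bodies {2,3},
    # and an intersection consisting of singletons only.
    print(sv_table)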
def execute(self):
    import pandas as pd
    self._sanitize_config()

    config = self.config_data
    options = config["options"]

    resource_mgr_client = ResourceManagerClient(options["resource-server"],
                                                options["resource-port"])
    volume_service = VolumeService.create_from_config(config["dvid-info"],
                                                      self.config_dir,
                                                      resource_mgr_client)

    self._init_meshes_instances()

    # Aim for 2 GB RDD partitions
    GB = 2**30
    target_partition_size_voxels = 2 * GB // np.uint64().nbytes

    # This will return None if we're not using sparse blocks
    sparse_block_mask = self._get_sparse_block_mask(volume_service)

    brick_wall = BrickWall.from_volume_service(volume_service, 0, None, self.sc,
                                               target_partition_size_voxels,
                                               sparse_block_mask)
    brick_wall.persist_and_execute("Downloading segmentation", logger)

    # brick -> [ (segment_label, (box, mask, count)),
    #            (segment_label, (box, mask, count)), ... ]
    segments_and_masks = brick_wall.bricks.map(partial(compute_segment_masks, config))
    persist_and_execute(segments_and_masks, "Computing brick-local segment masks", logger)
    brick_wall.unpersist()
    del brick_wall

    with Timer("Computing segment statistics", logger):
        mask_stats_df = self.compute_mask_stats(segments_and_masks)

    # Flatten now, AFTER stats have been computed
    # (compute_mask_stats() requires that the RDDs not have duplicate labels in them.)
    # While we're at it, drop the count (not needed any more)
    # --> (segment_label, (box, mask))
    def drop_count(items):
        new_items = []
        for item in items:
            segment_label, (box, mask, _count) = item
            new_items.append((segment_label, (box, mask)))
        return new_items

    segments_and_masks = segments_and_masks.flatMap(drop_count)

    bad_segments = mask_stats_df[['segment', 'compressed_bytes']] \
                     .query('compressed_bytes > 1.9e9')['segment']
    if len(bad_segments) > 0:
        logger.error(f"SOME SEGMENTS (N={len(bad_segments)}) ARE TOO BIG TO PROCESS. "
                     f"Skipping segments: {list(bad_segments)}.")
        segments_and_masks = segments_and_masks.filter(
            lambda seg_mask: seg_mask[0] not in bad_segments.values)

    # (segment, (box, mask))
    #   --> (segment, boxes_and_masks)
    #   === (segment, [(box, mask), (box, mask), (box, mask), ...])
    masks_by_segment_id = segments_and_masks.groupByKey()
    persist_and_execute(masks_by_segment_id,
                        "Grouping segment masks by segment label ID", logger)
    segments_and_masks.unpersist()
    del segments_and_masks

    # Insert chosen downsample_factor (a.k.a. dsf)
    #   --> (segment, dsf_and_boxes_and_masks)
    #   === (segment, (downsample_factor, [(box, mask), (box, mask), (box, mask), ...]))
    downsample_df = pd.Series(
        mask_stats_df['downsample_factor'].values,  # Must use '.values' here, otherwise
        index=mask_stats_df['segment'].values)      # the index is used to read the initial data.

    def insert_dsf(item):
        segment, boxes_and_masks = item
        downsample_factor = downsample_df[segment]
        return (segment, (downsample_factor, boxes_and_masks))

    masks_by_segment_id = masks_by_segment_id.map(insert_dsf)

    ##
    ## Filter out small segments and/or small bodies
    ##
    keep_col = mask_stats_df['keep_segment'] & mask_stats_df['keep_body']
    if not keep_col.all():
        # Note: This array will be broadcasted to the workers.
        #       It will be potentially quite large if we're keeping most (but not all) segments.
        #       Broadcast expense should be minimal thanks to lz4 compression,
        #       but RAM usage will be high.
        segments_to_keep = mask_stats_df['segment'][keep_col].values
        filtered_masks_by_segment_id = masks_by_segment_id.filter(
            lambda key_and_value: key_and_value[0] in segments_to_keep)
        persist_and_execute(filtered_masks_by_segment_id,
                            "Filtering masks by segment and size", logger)
        del masks_by_segment_id
        masks_by_segment_id = filtered_masks_by_segment_id

    # Aggregate
    # --> (segment_label, (box, mask, downsample_factor))
    segment_box_mask_factor = masks_by_segment_id.mapValues(partial(combine_masks, config))
    persist_and_execute(segment_box_mask_factor, "Assembling masks", logger)

    #
    # Re-compute meshes once for every simplification ratio in the config
    #
    for instance_name, simplification_ratio in zip(self.mesh_instances,
                                                   config["mesh-config"]["simplify-ratios"]):
        def _generate_mesh(box_mask_factor):
            box, mask, factor = box_mask_factor
            return generate_mesh(config, simplification_ratio, box, mask, factor)

        # --> (segment_label, (mesh_bytes, vertex_count))
        segments_meshes_counts = segment_box_mask_factor.mapValues(_generate_mesh)
        persist_and_execute(segments_meshes_counts,
                            f"Computing meshes at decimation {simplification_ratio:.2f}",
                            logger)

        with Timer("Computing mesh statistics", logger):
            mask_and_mesh_stats_df = self.append_mesh_stats(mask_stats_df,
                                                            segments_meshes_counts,
                                                            f'{simplification_ratio:.2f}')

        # Update the 'keep_body' column: Skip meshes that are too big.
        huge_bodies = (mask_and_mesh_stats_df['body_mesh_bytes'] > 1.9e9)
        if huge_bodies.any():
            logger.error("SOME BODY MESH GROUPS ARE TOO BIG TO PROCESS. "
                         "See dumped DataFrame for details.")
            mask_and_mesh_stats_df['keep_body'] &= ~huge_bodies

            # Drop them from the processing list
            segments_in_huge_bodies = mask_and_mesh_stats_df['segment'][huge_bodies].values
            segments_meshes_counts = segments_meshes_counts.filter(
                lambda seg_and_values: not (seg_and_values[0] in segments_in_huge_bodies))

        # --> (segment_label, mesh_bytes)
        def drop_vcount(item):
            segment_label, (mesh_bytes, _vertex_count) = item
            return (segment_label, mesh_bytes)

        segments_and_meshes = segments_meshes_counts.map(drop_vcount)

        # Group by body ID
        # --> ( body_id, ( segment_label, mesh_bytes ) )
        grouped_body_ids_segments_meshes = self.group_by_body(segments_and_meshes)
        unpersist(segments_and_meshes)
        del segments_and_meshes
        unpersist(segments_meshes_counts)
        del segments_meshes_counts

        with Timer("Writing meshes to DVID", logger):
            grouped_body_ids_segments_meshes.foreachPartition(
                partial(post_meshes_to_dvid, config, instance_name))

        unpersist(grouped_body_ids_segments_meshes)
        del grouped_body_ids_segments_meshes
def compute_mask_stats(self, segments_and_masks):
    """
    segments_and_masks: RDD where each element is of the form:
                        (label, (box, mask, count))
                        AND labels within a partition are UNIQUE.
    """
    config = self.config_data

    # In DataFrames, bounding box is stored as 6 int columns instead
    # of 1 'object' column for easier joins, combines, serialization, etc.
    BB_COLS = ['z0', 'y0', 'x0', 'z1', 'y1', 'x1']
    STATS_COLUMNS = ['segment', 'segment_voxel_count', 'compressed_bytes'] + BB_COLS

    def stats_df_for_masks(segments_and_masks):
        """
        Convert the list of elements, each in the form:
        (segment, (box, compressed_mask, count))
        into a pandas DataFrame.

        Note: This function assumes that there are no duplicate segments in the list.
              Therefore, it must be called only with the list of masks from a single 'brick'.
        """
        import pandas as pd
        pd.set_option('expand_frame_repr', False)

        # Each item is (segment, (box, compressed_mask, count))
        bounding_boxes = [object_info[1][0] for object_info in segments_and_masks]

        item_df = pd.DataFrame(columns=STATS_COLUMNS)
        item_df['segment'] = [object_info[0] for object_info in segments_and_masks]
        item_df['compressed_bytes'] = [object_info[1][1].compressed_nbytes
                                       for object_info in segments_and_masks]
        item_df['segment_voxel_count'] = [object_info[1][2]
                                          for object_info in segments_and_masks]
        item_df[BB_COLS] = np.array(bounding_boxes).reshape(-1, 6)
        return item_df

    def merge_stats(left, right):
        import pandas as pd
        pd.set_option('expand_frame_repr', False)

        # Join the two DFs and replace missing values with appropriate defaults
        joined = left.merge(right, 'outer', on='segment',
                            suffixes=('_left', '_right'), copy=False)
        fillna_inplace(joined, np.inf, ['z0_left', 'y0_left', 'x0_left'])
        fillna_inplace(joined, np.inf, ['z0_right', 'y0_right', 'x0_right'])
        fillna_inplace(joined, -np.inf, ['z1_left', 'y1_left', 'x1_left'])
        fillna_inplace(joined, -np.inf, ['z1_right', 'y1_right', 'x1_right'])
        fillna_inplace(joined, 0, ['segment_voxel_count_left', 'segment_voxel_count_right'])
        fillna_inplace(joined, 0, ['compressed_bytes_left', 'compressed_bytes_right'])

        # Now that the data is aligned by segment label, combine corresponding columns
        result = pd.DataFrame({'segment': joined['segment']})
        result['segment_voxel_count'] = (joined['segment_voxel_count_left']
                                         + joined['segment_voxel_count_right'])
        result['compressed_bytes'] = (joined['compressed_bytes_left']
                                      + joined['compressed_bytes_right'])
        result[['z0', 'y0', 'x0']] = np.minimum(joined[['z0_left', 'y0_left', 'x0_left']],
                                                joined[['z0_right', 'y0_right', 'x0_right']])
        result[['z1', 'y1', 'x1']] = np.maximum(joined[['z1_left', 'y1_left', 'x1_left']],
                                                joined[['z1_right', 'y1_right', 'x1_right']])

        assert set(result.columns) == set(STATS_COLUMNS)
        return result

    # Calculate segment (a.k.a. supervoxel) stats
    full_stats_df = segments_and_masks.map(stats_df_for_masks).treeReduce(merge_stats, depth=4)

    # Convert column types (float64 was used above to handle NaNs, but now we can convert back to int)
    convert_dtype_inplace(full_stats_df, np.uint64, ['segment_voxel_count', 'compressed_bytes'])
    # int32 is dangerous because multiplying them together quickly overflows
    convert_dtype_inplace(full_stats_df, np.int64, BB_COLS)

    full_stats_df['box_size'] = full_stats_df.eval('(z1 - z0)*(y1 - y0)*(x1 - x0)')
    full_stats_df['keep_segment'] = (full_stats_df['segment_voxel_count']
                                     >= config['options']['minimum-segment-size'])
    full_stats_df['keep_segment'] &= (full_stats_df['segment_voxel_count']
                                      <= config['options']['maximum-segment-size'])

    max_analysis_voxels = config['options']['max-analysis-volume']

    # Chosen downsample factor is max of user's minimum and auto-minimum
    full_stats_df['downsample_factor'] = 1 + np.power(
        full_stats_df['box_size'].values / max_analysis_voxels, (1. / 3)).astype(np.int16)
    full_stats_df['downsample_factor'] = np.maximum(
        full_stats_df['downsample_factor'],
        config['options']['minimum-downsample-factor'])

    # Convert to uint8 to save RAM (will be broadcasted to workers)
    assert full_stats_df['downsample_factor'].max() < 256
    full_stats_df['downsample_factor'] = full_stats_df['downsample_factor'].astype(np.uint8)
    assert full_stats_df['downsample_factor'].dtype == np.uint8

    ##
    ## If grouping segments into bodies (for tarballs),
    ## also append body stats
    ##
    grouping_scheme = config["mesh-config"]["storage"]["grouping-scheme"]
    if grouping_scheme == "labelmap":
        import pandas as pd
        mapping_pairs = self.load_labelmap()

        # Add body column
        segment_to_body_df = pd.DataFrame(mapping_pairs, columns=['segment', 'body'])
        full_stats_df = full_stats_df.merge(segment_to_body_df, 'left',
                                            on='segment', copy=False)

        # Missing segments in the labelmap are assumed to be identity-mapped
        full_stats_df['body'].fillna(full_stats_df['segment'], inplace=True)
        full_stats_df['body'] = full_stats_df['body'].astype(np.uint64)

        # Calculate body voxel sizes
        body_stats_df = full_stats_df[['body', 'segment_voxel_count']] \
                          .groupby('body').agg(['size', 'sum'])
        body_stats_df.columns = ['body_segment_count', 'body_voxel_count']
        body_stats_df['body'] = body_stats_df.index

        full_stats_df = full_stats_df.merge(body_stats_df, 'left', on='body', copy=False)

        if config["options"]["force-uniform-downsampling"]:
            body_downsample_factors = full_stats_df[['body', 'downsample_factor']] \
                                        .groupby('body', as_index=False).max()
            adjusted_downsample_factors = full_stats_df[['body']] \
                                            .merge(body_downsample_factors, 'left', on='body')
            full_stats_df['downsample_factor'] = \
                adjusted_downsample_factors['downsample_factor'].astype(np.uint8)

        # For offline analysis, write body stats to a file
        output_path = self.config_dir + '/body-stats.csv'
        logger.info(f"Saving body statistics to {output_path}")
        body_stats_df = body_stats_df[['body', 'body_segment_count', 'body_voxel_count']]  # Set col order
        body_stats_df.columns = ['body', 'segment_count', 'voxel_count']  # rename columns for csv
        body_stats_df.sort_values('voxel_count', ascending=False, inplace=True)
        body_stats_df.to_csv(output_path, header=True, index=False)

    else:
        # Not grouping -- Just duplicate segment stats into body columns
        full_stats_df['body'] = full_stats_df['segment']
        full_stats_df['body_voxel_count'] = full_stats_df['segment_voxel_count']

    full_stats_df['keep_body'] = (
        (full_stats_df['body_voxel_count'] >= config['options']['minimum-agglomerated-size']) &
        (full_stats_df['body_voxel_count'] <= config['options']['maximum-agglomerated-size']))

    # If subset-bodies were given, exclude all others.
    sparse_body_ids = config["mesh-config"]["storage"]["subset-bodies"]
    if sparse_body_ids:
        for body_id in sparse_body_ids:
            if not full_stats_df[full_stats_df['body'] == body_id]['keep_body'].all():
                logger.error(f"You explicitly listed body {body_id} in subset-bodies, "
                             "but it will be excluded due to your other config settings.")
        full_stats_df['keep_body'] &= full_stats_df.eval('body in @sparse_body_ids')

    # Sort for convenience of viewing output
    with Timer("Sorting segment stats", logger):
        full_stats_df.sort_values(['body_voxel_count', 'segment_voxel_count'],
                                  ascending=False, inplace=True)

    #import pandas as pd
    #pd.set_option('expand_frame_repr', False)
    #logger.info(f"FULL_STATS:\n{full_stats_df}")

    stats_bytes = full_stats_df.memory_usage().sum()
    stats_gb = stats_bytes / 1e9

    # Write the Stats DataFrame to a file for offline analysis.
    output_path = self.config_dir + '/segment-stats-dataframe.pkl.xz'
    logger.info(f"Saving segment statistics ({stats_gb:.3f} GB) to {output_path}")
    full_stats_df.to_pickle(output_path)

    return full_stats_df
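# A small worked example of the automatic downsample factor above (values
# hypothetical): with max-analysis-volume = 1e9 voxels, a segment whose
# bounding box holds 8e9 voxels gets
#
#   1 + int((8e9 / 1e9) ** (1/3))  ==  1 + int(2.0)  ==  3
#
# which is then clamped below by 'minimum-downsample-factor' and must stay
# under 256 so it fits in a uint8.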
import numpy as np
from skimage.util import view_as_blocks

from DVIDSparkServices.util import Timer
# Note: is_view_of, encode_mask_blocks, and decode_mask_blocks are assumed to be
# importable from this project's own modules (import locations not shown here).

blocks = np.random.randint(0, 2, size=(10, 64, 64, 64), dtype=bool)
#blocks = np.ones((10,64,64,64), dtype=bool)

for block in blocks:
    # Randomly select a fourth of the subblocks to be completely 1,
    # and one fourth to be completely 0
    block_modes = np.random.randint(0, 4, size=(8, 8, 8), dtype=int)
    v = view_as_blocks(block, (8, 8, 8))
    assert is_view_of(v, blocks)
    v &= (block_modes[..., None, None, None] != 0)
    v |= (block_modes[..., None, None, None] == 1)

with Timer() as enc_timer:
    encoded = encode_mask_blocks(blocks)

orig_bytes = 64 * 64 * 64 * len(blocks)
encoded_bytes = len(encoded)
print(f"Size reduction: {orig_bytes} -> {encoded_bytes} ({orig_bytes/encoded_bytes:.1f}x)")

with Timer() as dec_timer:
    decoded, corners, label = decode_mask_blocks(encoded)

print(f"Mask encoding seconds: {enc_timer.seconds}")
print(f"Mask decoding seconds: {dec_timer.seconds}")

assert (np.array(decoded) == np.array(blocks)).all()