def apply_label_mapping(bricks, mapping_pairs):
    """
    Remap the label volume of every brick using a pre-loaded labelmap.

    bricks:
        RDD of Bricks containing label volumes.
        Each brick's ``volume`` attribute is overwritten with the remapped data.

    mapping_pairs:
        Mapping as returned by load_labelmap.
        An ndarray of the form:
            [[orig,new],
             [orig,new],
             ... ]

    Returns the remapped bricks, after handing them to persist_and_execute().
    """
    from dvidutils import LabelMapper

    def _remap_partition(brick_iter):
        # One LabelMapper serves every brick of the partition.
        orig_labels, new_labels = mapping_pairs.transpose()
        label_mapper = LabelMapper(orig_labels, new_labels)

        remapped = list(brick_iter)
        for b in remapped:
            # TODO: Apparently LabelMapper can't handle non-contiguous arrays right now.
            #       (It yields incorrect results)
            #       Check to see if this is still a problem in the latest version of xtensor-python.
            b.volume = np.asarray(b.volume, order='C')
            label_mapper.apply_inplace(b.volume, allow_unmapped=True)
        return remapped

    # Work partition-wise (rather than element-wise) so that the
    # LabelMapper is constructed just once per partition.
    result = rt.map_partitions(_remap_partition, bricks)
    persist_and_execute(result, f"Remapping bricks", logger)
    return result
def group_by_body(self, segments_and_meshes):
    """
    Group (body_id, mesh) pairs according to the configured grouping scheme.

    segments_and_meshes:
        RDD of (body_id, mesh) pairs.

    The scheme is read from config["mesh-config"]["storage"]["grouping-scheme"]:

    - "hundreds":
        Bodies are grouped by their ID rounded down to a multiple of 100.
    - "labelmap":
        The group ID is looked up in the labelmap returned by self.load_labelmap().
        Bodies missing from the labelmap use their own ID as the group ID.
        Groups listed in the "skip-groups" setting are dropped.
    - "singletons" / "no-groups":
        Every body becomes its own one-item group.

    Returns an RDD of (group_id, group) pairs, where each group is a collection
    of (body_id, mesh) pairs, after handing it to persist_and_execute().

    Raises ValueError if the grouping scheme is not one of the above.
    """
    config = self.config_data
    storage_config = config["mesh-config"]["storage"]

    # Group according to scheme
    grouping_scheme = storage_config["grouping-scheme"]
    n_partitions = num_worker_nodes() * cpus_per_worker()

    # Note: This must be an equality test.  ("x in 'hundreds'" is a substring
    #       test, which would also accept '', 'h', 'reds', etc.)
    if grouping_scheme == "hundreds":
        def hundreds_group_id( id_mesh ):
            body_id, _mesh = id_mesh
            return body_id - (body_id % 100)
        grouped_body_ids_and_meshes = segments_and_meshes.groupBy(hundreds_group_id, numPartitions=n_partitions)

    elif grouping_scheme == "labelmap":
        import pandas as pd
        mapping_pairs = self.load_labelmap()

        def prepend_mapped_group_id( id_mesh_partition ):
            df = pd.DataFrame( mapping_pairs, columns=["body_id", "group_id"] )

            new_partition = []
            for id_mesh in id_mesh_partition:
                body_id, mesh = id_mesh
                rows = df.loc[df.body_id == body_id]
                if len(rows) == 0:
                    # If missing from labelmap,
                    # we assume an implicit identity mapping
                    group_id = body_id
                else:
                    group_id = rows['group_id'].iloc[0]
                new_partition.append( (group_id, (body_id, mesh)) )
            return new_partition

        # We do this via mapPartitions().groupByKey() instead of a simple groupBy()
        # to save time constructing the DataFrame inside the closure above.
        # (TODO: Figure out why the dataframe isn't pickling properly...)
        skip_groups = set(storage_config["skip-groups"])
        grouped_body_ids_and_meshes = segments_and_meshes.mapPartitions( prepend_mapped_group_id ) \
                                                         .filter(lambda item: item[0] not in skip_groups) \
                                                         .groupByKey(numPartitions=n_partitions)

    elif grouping_scheme in ("singletons", "no-groups"):
        # Create 'groups' of one item each, re-using the body ID as the group id.
        # (The difference between 'singletons', and 'no-groups' is in how the mesh is stored, below.)
        grouped_body_ids_and_meshes = segments_and_meshes.map( lambda id_mesh: (id_mesh[0], [(id_mesh[0], id_mesh[1])]) )

    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(f"Unknown grouping-scheme: '{grouping_scheme}'")

    persist_and_execute(grouped_body_ids_and_meshes, f"Grouping meshes with scheme: '{grouping_scheme}'", logger)
    return grouped_body_ids_and_meshes
def _execute_skeletonization(self, large_id_box_mask_factor_err):
    """
    Compute a skeleton for every element of the given RDD and post the results to DVID.

    large_id_box_mask_factor_err:
        RDD whose elements are passed, one at a time, to skeletonize_in_subprocess().
    """
    config = self.config_data

    @self.collect_log(lambda _: '_SKELETONIZATION_ERRORS')
    def logged_skeletonize(arg):
        return skeletonize_in_subprocess(config, arg)

    def error_field(id_swc_err):
        # The error message is the last field of each result tuple.
        return id_swc_err[-1]

    # --> (body_id, swc_contents, error_msg)
    skeleton_rdd = large_id_box_mask_factor_err.map(logged_skeletonize)
    persist_and_execute(skeleton_rdd, "Computing skeletons", logger)

    # Errors were already written to a separate file; echo them in the master log, too.
    error_rdd = skeleton_rdd.map(error_field).filter(bool)
    for message in error_rdd.collect():
        logger.error(message)

    # Write
    with Timer() as timer:
        skeleton_rdd.foreachPartition(partial(post_swcs_to_dvid, config))
    logger.info(f"Writing skeletons to DVID took {timer.seconds}")
def _execute_skeletonization(self, large_id_box_mask_factor_err):
    """
    Compute a skeleton for every element of the given RDD and post the results to DVID.

    large_id_box_mask_factor_err:
        RDD whose elements are passed, one at a time, to skeletonize_in_subprocess().
    """
    config = self.config_data

    @self.collect_log(lambda _: '_SKELETONIZATION_ERRORS')
    def logged_skeletonize(arg):
        return skeletonize_in_subprocess(config, arg)

    def error_field(id_swc_err):
        # The error message is the last field of each result tuple.
        return id_swc_err[-1]

    # --> (body_id, swc_contents, error_msg)
    skeleton_rdd = large_id_box_mask_factor_err.map(logged_skeletonize)
    persist_and_execute(skeleton_rdd, "Computing skeletons", logger)

    # Errors were already written to a separate file; echo them in the master log, too.
    error_rdd = skeleton_rdd.map(error_field).filter(bool)
    for message in error_rdd.collect():
        logger.error(message)

    # Write
    with Timer() as timer:
        skeleton_rdd.foreachPartition(partial(post_swcs_to_dvid, config))
    logger.info(f"Writing skeletons to DVID took {timer.seconds}")
def execute(self):
    """
    Run the mesh workflow.

    Downloads the segmentation as bricks, computes per-segment masks and statistics,
    assembles one mask per segment, and then -- once per configured simplification
    ratio -- generates meshes, groups them by body and posts them to DVID.
    """
    import pandas as pd
    self._sanitize_config()

    config = self.config_data
    options = config["options"]

    resource_mgr_client = ResourceManagerClient(
        options["resource-server"], options["resource-port"])
    volume_service = VolumeService.create_from_config(
        config["dvid-info"], self.config_dir, resource_mgr_client)

    self._init_meshes_instances()

    # Aim for 2 GB RDD partitions
    GB = 2**30
    voxel_bytes = np.uint64().nbytes
    target_partition_size_voxels = (2 * GB) // voxel_bytes

    # This will return None if we're not using sparse blocks
    sparse_block_mask = self._get_sparse_block_mask(volume_service)

    brick_wall = BrickWall.from_volume_service(
        volume_service, 0, None, self.sc,
        target_partition_size_voxels, sparse_block_mask)
    brick_wall.persist_and_execute("Downloading segmentation", logger)

    # brick -> [ (segment_label, (box, mask, count)),
    #            (segment_label, (box, mask, count)), ... ]
    segments_and_masks = brick_wall.bricks.map(partial(compute_segment_masks, config))
    persist_and_execute(segments_and_masks, "Computing brick-local segment masks", logger)
    brick_wall.unpersist()
    del brick_wall

    with Timer("Computing segment statistics", logger):
        mask_stats_df = self.compute_mask_stats(segments_and_masks)

    # Flatten now, AFTER stats have been computed
    # (compute_mask_stats() requires that the RDDs not have duplicate labels in them.)
    # While we're at it, drop the count (not needed any more)
    # --> (segment_label, (box, mask))
    def drop_count(items):
        return [(label, (box, mask)) for label, (box, mask, _count) in items]
    segments_and_masks = segments_and_masks.flatMap(drop_count)

    size_table = mask_stats_df[['segment', 'compressed_bytes']]
    bad_segments = size_table.query('compressed_bytes > 1.9e9')['segment']
    if len(bad_segments) > 0:
        logger.error(f"SOME SEGMENTS (N={len(bad_segments)}) ARE TOO BIG TO PROCESS. Skipping segments: {list(bad_segments)}.")

        def is_manageable(seg_mask):
            return seg_mask[0] not in bad_segments.values
        segments_and_masks = segments_and_masks.filter(is_manageable)

    # (segment, (box, mask))
    #   --> (segment, boxes_and_masks)
    #   === (segment, [(box, mask), (box, mask), (box, mask), ...])
    masks_by_segment_id = segments_and_masks.groupByKey()
    persist_and_execute(masks_by_segment_id, "Grouping segment masks by segment label ID", logger)
    segments_and_masks.unpersist()
    del segments_and_masks

    # Insert chosen downsample_factor (a.k.a. dsf)
    #   --> (segment, dsf_and_boxes_and_masks)
    #   === (segment, (downsample_factor, [(box, mask), (box, mask), (box, mask), ...]))
    #
    # Must use '.values' for the data here, otherwise
    # the index is used to read the initial data.
    dsf_values = mask_stats_df['downsample_factor'].values
    segment_ids = mask_stats_df['segment'].values
    dsf_by_segment = pd.Series(dsf_values, index=segment_ids)

    def insert_dsf(item):
        segment, boxes_and_masks = item
        return (segment, (dsf_by_segment[segment], boxes_and_masks))
    masks_by_segment_id = masks_by_segment_id.map(insert_dsf)

    ##
    ## Filter out small segments and/or small bodies
    ##
    keep_col = mask_stats_df['keep_segment'] & mask_stats_df['keep_body']
    if not keep_col.all():
        # Note: This array will be broadcasted to the workers.
        #       It will be potentially quite large if we're keeping most (but not all) segments.
        #       Broadcast expense should be minimal thanks to lz4 compression,
        #       but RAM usage will be high.
        segments_to_keep = mask_stats_df['segment'][keep_col].values

        def is_kept(key_and_value):
            return key_and_value[0] in segments_to_keep
        filtered_masks_by_segment_id = masks_by_segment_id.filter(is_kept)
        persist_and_execute(filtered_masks_by_segment_id, "Filtering masks by segment and size", logger)

        masks_by_segment_id = filtered_masks_by_segment_id

    # Aggregate
    # --> (segment_label, (box, mask, downsample_factor))
    segment_box_mask_factor = masks_by_segment_id.mapValues(partial(combine_masks, config))
    persist_and_execute(segment_box_mask_factor, "Assembling masks", logger)

    #
    # Re-compute meshes once for every simplification ratio in the config
    #
    simplify_ratios = config["mesh-config"]["simplify-ratios"]
    for instance_name, simplification_ratio in zip(self.mesh_instances, simplify_ratios):
        def _generate_mesh(box_mask_factor):
            box, mask, factor = box_mask_factor
            return generate_mesh(config, simplification_ratio, box, mask, factor)

        # --> (segment_label, (mesh_bytes, vertex_count))
        segments_meshes_counts = segment_box_mask_factor.mapValues(_generate_mesh)
        persist_and_execute(segments_meshes_counts, f"Computing meshes at decimation {simplification_ratio:.2f}", logger)

        with Timer("Computing mesh statistics", logger):
            mask_and_mesh_stats_df = self.append_mesh_stats(
                mask_stats_df, segments_meshes_counts, f'{simplification_ratio:.2f}')

        # Update the 'keep_body' column: Skip meshes that are too big.
        huge_bodies = mask_and_mesh_stats_df['body_mesh_bytes'] > 1.9e9
        if huge_bodies.any():
            logger.error("SOME BODY MESH GROUPS ARE TOO BIG TO PROCESS. See dumped DataFrame for details.")
            mask_and_mesh_stats_df['keep_body'] &= ~huge_bodies

            # Drop them from the processing list
            segments_in_huge_bodies = mask_and_mesh_stats_df['segment'][huge_bodies].values

            def not_in_huge_body(seg_and_values):
                return seg_and_values[0] not in segments_in_huge_bodies
            segments_meshes_counts = segments_meshes_counts.filter(not_in_huge_body)

        # --> (segment_label, mesh_bytes)
        def drop_vcount(item):
            label, (mesh_bytes, _vertex_count) = item
            return (label, mesh_bytes)
        segments_and_meshes = segments_meshes_counts.map(drop_vcount)

        # Group by body ID
        # --> ( body_id ( segment_label, mesh_bytes ) )
        grouped_body_ids_segments_meshes = self.group_by_body(segments_and_meshes)
        unpersist(segments_and_meshes)
        del segments_and_meshes

        unpersist(segments_meshes_counts)
        del segments_meshes_counts

        with Timer("Writing meshes to DVID", logger):
            grouped_body_ids_segments_meshes.foreachPartition(
                partial(post_meshes_to_dvid, config, instance_name))

        unpersist(grouped_body_ids_segments_meshes)
        del grouped_body_ids_segments_meshes
def execute(self):
    """
    Run the mesh workflow.

    Downloads the segmentation as bricks, computes per-segment masks and statistics,
    assembles one mask per segment, and then -- once per configured simplification
    ratio -- generates meshes, groups them by body and posts them to DVID.
    """
    import pandas as pd
    self._sanitize_config()

    config = self.config_data
    options = config["options"]

    resource_mgr_client = ResourceManagerClient(
        options["resource-server"], options["resource-port"])
    volume_service = VolumeService.create_from_config(
        config["dvid-info"], self.config_dir, resource_mgr_client)

    self._init_meshes_instances()

    # Aim for 2 GB RDD partitions
    GB = 2**30
    voxel_bytes = np.uint64().nbytes
    target_partition_size_voxels = (2 * GB) // voxel_bytes

    # This will return None if we're not using sparse blocks
    sparse_block_mask = self._get_sparse_block_mask(volume_service)

    brick_wall = BrickWall.from_volume_service(
        volume_service, 0, None, self.sc,
        target_partition_size_voxels, sparse_block_mask)
    brick_wall.persist_and_execute("Downloading segmentation", logger)

    # brick -> [ (segment_label, (box, mask, count)),
    #            (segment_label, (box, mask, count)), ... ]
    segments_and_masks = brick_wall.bricks.map(partial(compute_segment_masks, config))
    persist_and_execute(segments_and_masks, "Computing brick-local segment masks", logger)
    brick_wall.unpersist()
    del brick_wall

    with Timer("Computing segment statistics", logger):
        mask_stats_df = self.compute_mask_stats(segments_and_masks)

    # Flatten now, AFTER stats have been computed
    # (compute_mask_stats() requires that the RDDs not have duplicate labels in them.)
    # While we're at it, drop the count (not needed any more)
    # --> (segment_label, (box, mask))
    def drop_count(items):
        return [(label, (box, mask)) for label, (box, mask, _count) in items]
    segments_and_masks = segments_and_masks.flatMap(drop_count)

    size_table = mask_stats_df[['segment', 'compressed_bytes']]
    bad_segments = size_table.query('compressed_bytes > 1.9e9')['segment']
    if len(bad_segments) > 0:
        logger.error(f"SOME SEGMENTS (N={len(bad_segments)}) ARE TOO BIG TO PROCESS. Skipping segments: {list(bad_segments)}.")

        def is_manageable(seg_mask):
            return seg_mask[0] not in bad_segments.values
        segments_and_masks = segments_and_masks.filter(is_manageable)

    # (segment, (box, mask))
    #   --> (segment, boxes_and_masks)
    #   === (segment, [(box, mask), (box, mask), (box, mask), ...])
    masks_by_segment_id = segments_and_masks.groupByKey()
    persist_and_execute(masks_by_segment_id, "Grouping segment masks by segment label ID", logger)
    segments_and_masks.unpersist()
    del segments_and_masks

    # Insert chosen downsample_factor (a.k.a. dsf)
    #   --> (segment, dsf_and_boxes_and_masks)
    #   === (segment, (downsample_factor, [(box, mask), (box, mask), (box, mask), ...]))
    #
    # Must use '.values' for the data here, otherwise
    # the index is used to read the initial data.
    dsf_values = mask_stats_df['downsample_factor'].values
    segment_ids = mask_stats_df['segment'].values
    dsf_by_segment = pd.Series(dsf_values, index=segment_ids)

    def insert_dsf(item):
        segment, boxes_and_masks = item
        return (segment, (dsf_by_segment[segment], boxes_and_masks))
    masks_by_segment_id = masks_by_segment_id.map(insert_dsf)

    ##
    ## Filter out small segments and/or small bodies
    ##
    keep_col = mask_stats_df['keep_segment'] & mask_stats_df['keep_body']
    if not keep_col.all():
        # Note: This array will be broadcasted to the workers.
        #       It will be potentially quite large if we're keeping most (but not all) segments.
        #       Broadcast expense should be minimal thanks to lz4 compression,
        #       but RAM usage will be high.
        segments_to_keep = mask_stats_df['segment'][keep_col].values

        def is_kept(key_and_value):
            return key_and_value[0] in segments_to_keep
        filtered_masks_by_segment_id = masks_by_segment_id.filter(is_kept)
        persist_and_execute(filtered_masks_by_segment_id, "Filtering masks by segment and size", logger)

        masks_by_segment_id = filtered_masks_by_segment_id

    # Aggregate
    # --> (segment_label, (box, mask, downsample_factor))
    segment_box_mask_factor = masks_by_segment_id.mapValues(partial(combine_masks, config))
    persist_and_execute(segment_box_mask_factor, "Assembling masks", logger)

    #
    # Re-compute meshes once for every simplification ratio in the config
    #
    simplify_ratios = config["mesh-config"]["simplify-ratios"]
    for instance_name, simplification_ratio in zip(self.mesh_instances, simplify_ratios):
        def _generate_mesh(box_mask_factor):
            box, mask, factor = box_mask_factor
            return generate_mesh(config, simplification_ratio, box, mask, factor)

        # --> (segment_label, (mesh_bytes, vertex_count))
        segments_meshes_counts = segment_box_mask_factor.mapValues(_generate_mesh)
        persist_and_execute(segments_meshes_counts, f"Computing meshes at decimation {simplification_ratio:.2f}", logger)

        with Timer("Computing mesh statistics", logger):
            mask_and_mesh_stats_df = self.append_mesh_stats(
                mask_stats_df, segments_meshes_counts, f'{simplification_ratio:.2f}')

        # Update the 'keep_body' column: Skip meshes that are too big.
        huge_bodies = mask_and_mesh_stats_df['body_mesh_bytes'] > 1.9e9
        if huge_bodies.any():
            logger.error("SOME BODY MESH GROUPS ARE TOO BIG TO PROCESS. See dumped DataFrame for details.")
            mask_and_mesh_stats_df['keep_body'] &= ~huge_bodies

            # Drop them from the processing list
            segments_in_huge_bodies = mask_and_mesh_stats_df['segment'][huge_bodies].values

            def not_in_huge_body(seg_and_values):
                return seg_and_values[0] not in segments_in_huge_bodies
            segments_meshes_counts = segments_meshes_counts.filter(not_in_huge_body)

        # --> (segment_label, mesh_bytes)
        def drop_vcount(item):
            label, (mesh_bytes, _vertex_count) = item
            return (label, mesh_bytes)
        segments_and_meshes = segments_meshes_counts.map(drop_vcount)

        # Group by body ID
        # --> ( body_id ( segment_label, mesh_bytes ) )
        grouped_body_ids_segments_meshes = self.group_by_body(segments_and_meshes)
        unpersist(segments_and_meshes)
        del segments_and_meshes

        unpersist(segments_meshes_counts)
        del segments_meshes_counts

        with Timer("Writing meshes to DVID", logger):
            grouped_body_ids_segments_meshes.foreachPartition(
                partial(post_meshes_to_dvid, config, instance_name))

        unpersist(grouped_body_ids_segments_meshes)
        del grouped_body_ids_segments_meshes
def _execute_mesh_generation(self, large_id_box_mask_factor_err):
    """
    Generate a mesh for every element of the given RDD, group the meshes
    according to the configured grouping scheme, and post the groups to DVID.

    large_id_box_mask_factor_err:
        RDD whose elements are passed, one at a time, to generate_mesh_in_subprocess().

    The grouping scheme is read from config["mesh-config"]["storage"]["grouping-scheme"]
    and must be one of "hundreds", "labelmap", "singletons" or "no-groups".

    Raises ValueError if the grouping scheme is not recognized.
    """
    config = self.config_data
    storage_config = config["mesh-config"]["storage"]

    @self.collect_log(lambda _: '_MESH_GENERATION_ERRORS')
    def logged_generate_mesh(arg):
        return generate_mesh_in_subprocess(config, arg)

    #     --> (body_id, mesh_bytes, error_msg)
    body_ids_and_meshes_with_err = large_id_box_mask_factor_err.map( logged_generate_mesh )
    persist_and_execute(body_ids_and_meshes_with_err, "Computing meshes", logger)

    # Errors were already written to a separate file, but let's duplicate them in the master log.
    errors = body_ids_and_meshes_with_err.map(lambda id_mesh_err: id_mesh_err[-1]).filter(bool).collect()
    for error in errors:
        logger.error(error)

    # Filter out error cases
    body_ids_and_meshes = body_ids_and_meshes_with_err.filter(lambda id_mesh_err: id_mesh_err[-1] is None) \
                                                      .map( lambda id_mesh_err: id_mesh_err[:2] )

    # Group according to scheme
    grouping_scheme = storage_config["grouping-scheme"]
    n_partitions = num_worker_nodes() * cpus_per_worker()

    # Note: This must be an equality test.  ("x in 'hundreds'" is a substring
    #       test, which would also accept '', 'h', 'reds', etc.)
    if grouping_scheme == "hundreds":
        def hundreds_group_id( id_mesh ):
            body_id, _mesh = id_mesh
            return body_id - (body_id % 100)
        grouped_body_ids_and_meshes = body_ids_and_meshes.groupBy(hundreds_group_id, numPartitions=n_partitions)

    elif grouping_scheme == "labelmap":
        import pandas as pd
        mapping_pairs = load_labelmap( storage_config["labelmap"], self.config_dir )

        def prepend_mapped_group_id( id_mesh_partition ):
            df = pd.DataFrame( mapping_pairs, columns=["body_id", "group_id"] )

            new_partition = []
            for id_mesh in id_mesh_partition:
                body_id, mesh = id_mesh
                rows = df.loc[df.body_id == body_id]
                if len(rows) == 0:
                    # If missing from labelmap,
                    # we assume an implicit identity mapping
                    group_id = body_id
                else:
                    group_id = rows['group_id'].iloc[0]
                new_partition.append( (group_id, (body_id, mesh)) )
            return new_partition

        # We do this via mapPartitions().groupByKey() instead of a simple groupBy()
        # to save time constructing the DataFrame inside the closure above.
        # (TODO: Figure out why the dataframe isn't pickling properly...)
        skip_groups = set(storage_config["skip-groups"])
        grouped_body_ids_and_meshes = body_ids_and_meshes.mapPartitions( prepend_mapped_group_id ) \
                                                         .filter(lambda item: item[0] not in skip_groups) \
                                                         .groupByKey(numPartitions=n_partitions)

    elif grouping_scheme in ("singletons", "no-groups"):
        # Create 'groups' of one item each, re-using the body ID as the group id.
        # (The difference between 'singletons', and 'no-groups' is in how the mesh is stored, below.)
        grouped_body_ids_and_meshes = body_ids_and_meshes.map( lambda id_mesh: (id_mesh[0], [(id_mesh[0], id_mesh[1])]) )

    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(f"Unknown grouping-scheme: '{grouping_scheme}'")

    persist_and_execute(grouped_body_ids_and_meshes, f"Grouping meshes with scheme: '{grouping_scheme}'", logger)
    unpersist(body_ids_and_meshes)
    del body_ids_and_meshes

    with Timer() as timer:
        grouped_body_ids_and_meshes.foreachPartition( partial(post_meshes_to_dvid, config) )
    logger.info(f"Writing meshes to DVID took {timer.seconds}")
def execute(self):
    """
    Run the workflow: download the segmentation as bricks, build one combined
    (downsampled) mask per body, and then produce whichever outputs are listed
    in config["options"]["output-types"] ("neutube-skeleton" and/or "mesh").
    """
    self._sanitize_config()

    config = self.config_data
    options = config["options"]

    resource_mgr_client = ResourceManagerClient(
        options["resource-server"], options["resource-port"])
    volume_service = VolumeService.create_from_config(
        config["dvid-info"], self.config_dir, resource_mgr_client)

    self._init_skeletons_instance()

    # Aim for 2 GB RDD partitions
    GB = 2**30
    voxel_bytes = np.uint64().nbytes
    target_partition_size_voxels = (2 * GB) // voxel_bytes

    brick_wall = BrickWall.from_volume_service(
        volume_service, 0, None, self.sc, target_partition_size_voxels)
    brick_wall.persist_and_execute("Downloading segmentation", logger)

    # brick -> (body_id, (box, mask, count))
    body_ids_and_masks = brick_wall.bricks.flatMap(partial(body_masks, config))
    persist_and_execute(body_ids_and_masks, "Computing brick-local masks", logger)
    brick_wall.unpersist()
    del brick_wall

    # In the case of catastrophic merges, some bodies may be too big to handle.
    # Skeletonizing them would probably time out anyway.
    bad_bodies = self.list_unmanageable_bodies(body_ids_and_masks)

    def is_manageable(k_v):
        return k_v[0] not in bad_bodies
    body_ids_and_masks = body_ids_and_masks.filter(is_manageable)

    # (body_id, (box, mask, count))
    #   --> (body_id, [(box, mask, count), (box, mask, count), (box, mask, count), ...])
    grouped_body_ids_and_masks = body_ids_and_masks.groupByKey()
    persist_and_execute(grouped_body_ids_and_masks, "Grouping masks by body id", logger)
    body_ids_and_masks.unpersist()
    del body_ids_and_masks

    # (Same RDD contents, but without small bodies)
    is_large_enough = partial(is_combined_object_large_enough, config)
    grouped_large_body_ids_and_masks = grouped_body_ids_and_masks.filter(is_large_enough)
    persist_and_execute(grouped_large_body_ids_and_masks, "Filtering masks by size", logger)
    grouped_body_ids_and_masks.unpersist()
    del grouped_body_ids_and_masks

    @self.collect_log(lambda _: '_AGGREGATION_ERRORS')
    def logged_combine(arg):
        return combine_masks_in_subprocess(config, arg)

    #  --> (body_id, combined_box, mask, downsample_factor, error_msg)
    id_box_mask_factor_err = grouped_large_body_ids_and_masks.map(logged_combine)
    persist_and_execute(id_box_mask_factor_err, "Downsampling and aggregating masks", logger)
    grouped_large_body_ids_and_masks.unpersist()
    del grouped_large_body_ids_and_masks

    # Errors were already written to a separate file; echo them in the master log, too.
    def error_field(i_b_m_f_e):
        return i_b_m_f_e[-1]
    for message in id_box_mask_factor_err.map(error_field).filter(bool).collect():
        logger.error(message)

    # Small bodies (or those with errors) were not processed,
    # and 'None' was returned instead of a mask. Remove them.
    def mask_is_not_none(i_b_m_f_e):
        _body, _box, combined_mask, _factor, _err = i_b_m_f_e
        return combined_mask is not None

    large_id_box_mask_factor_err = id_box_mask_factor_err.filter(mask_is_not_none)

    output_types = config["options"]["output-types"]
    if "neutube-skeleton" in output_types:
        self._execute_skeletonization(large_id_box_mask_factor_err)

    if "mesh" in output_types:
        self._execute_mesh_generation(large_id_box_mask_factor_err)
def _execute_mesh_generation(self, large_id_box_mask_factor_err):
    """
    Generate a mesh for every element of the given RDD, group the meshes
    according to the configured grouping scheme, and post the groups to DVID.

    large_id_box_mask_factor_err:
        RDD whose elements are passed, one at a time, to generate_mesh_in_subprocess().

    The grouping scheme is read from config["mesh-config"]["storage"]["grouping-scheme"]
    and must be one of "hundreds", "labelmap", "singletons" or "no-groups".

    Raises ValueError if the grouping scheme is not recognized.
    """
    config = self.config_data
    storage_config = config["mesh-config"]["storage"]

    @self.collect_log(lambda _: '_MESH_GENERATION_ERRORS')
    def logged_generate_mesh(arg):
        return generate_mesh_in_subprocess(config, arg)

    #     --> (body_id, mesh_bytes, error_msg)
    body_ids_and_meshes_with_err = large_id_box_mask_factor_err.map(
        logged_generate_mesh)
    persist_and_execute(body_ids_and_meshes_with_err, "Computing meshes", logger)

    # Errors were already written to a separate file, but let's duplicate them in the master log.
    errors = body_ids_and_meshes_with_err.map(
        lambda id_mesh_err: id_mesh_err[-1]).filter(bool).collect()
    for error in errors:
        logger.error(error)

    # Filter out error cases
    body_ids_and_meshes = body_ids_and_meshes_with_err.filter(lambda id_mesh_err: id_mesh_err[-1] is None) \
                                                      .map( lambda id_mesh_err: id_mesh_err[:2] )

    # Group according to scheme
    grouping_scheme = storage_config["grouping-scheme"]
    n_partitions = num_worker_nodes() * cpus_per_worker()

    # Note: This must be an equality test.  ("x in 'hundreds'" is a substring
    #       test, which would also accept '', 'h', 'reds', etc.)
    if grouping_scheme == "hundreds":

        def hundreds_group_id(id_mesh):
            body_id, _mesh = id_mesh
            return body_id - (body_id % 100)

        grouped_body_ids_and_meshes = body_ids_and_meshes.groupBy(
            hundreds_group_id, numPartitions=n_partitions)

    elif grouping_scheme == "labelmap":
        import pandas as pd
        mapping_pairs = load_labelmap(
            storage_config["labelmap"], self.config_dir)

        def prepend_mapped_group_id(id_mesh_partition):
            df = pd.DataFrame(mapping_pairs, columns=["body_id", "group_id"])

            new_partition = []
            for id_mesh in id_mesh_partition:
                body_id, mesh = id_mesh
                rows = df.loc[df.body_id == body_id]
                if len(rows) == 0:
                    # If missing from labelmap,
                    # we assume an implicit identity mapping
                    group_id = body_id
                else:
                    group_id = rows['group_id'].iloc[0]
                new_partition.append((group_id, (body_id, mesh)))
            return new_partition

        # We do this via mapPartitions().groupByKey() instead of a simple groupBy()
        # to save time constructing the DataFrame inside the closure above.
        # (TODO: Figure out why the dataframe isn't pickling properly...)
        skip_groups = set(storage_config["skip-groups"])
        grouped_body_ids_and_meshes = body_ids_and_meshes.mapPartitions( prepend_mapped_group_id ) \
                                                         .filter(lambda item: item[0] not in skip_groups) \
                                                         .groupByKey(numPartitions=n_partitions)

    elif grouping_scheme in ("singletons", "no-groups"):
        # Create 'groups' of one item each, re-using the body ID as the group id.
        # (The difference between 'singletons', and 'no-groups' is in how the mesh is stored, below.)
        grouped_body_ids_and_meshes = body_ids_and_meshes.map(
            lambda id_mesh: (id_mesh[0], [(id_mesh[0], id_mesh[1])]))

    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(f"Unknown grouping-scheme: '{grouping_scheme}'")

    persist_and_execute(
        grouped_body_ids_and_meshes,
        f"Grouping meshes with scheme: '{grouping_scheme}'", logger)
    unpersist(body_ids_and_meshes)
    del body_ids_and_meshes

    with Timer() as timer:
        grouped_body_ids_and_meshes.foreachPartition(
            partial(post_meshes_to_dvid, config))
    logger.info(f"Writing meshes to DVID took {timer.seconds}")
def execute(self):
    """
    Run the workflow: download the segmentation as bricks, build one combined
    (downsampled) mask per body, and then produce whichever outputs are listed
    in config["options"]["output-types"] ("neutube-skeleton" and/or "mesh").
    """
    self._sanitize_config()

    config = self.config_data
    options = config["options"]

    resource_mgr_client = ResourceManagerClient(
        options["resource-server"], options["resource-port"])
    volume_service = VolumeService.create_from_config(
        config["dvid-info"], self.config_dir, resource_mgr_client)

    self._init_skeletons_instance()

    # Aim for 2 GB RDD partitions
    GB = 2**30
    voxel_bytes = np.uint64().nbytes
    target_partition_size_voxels = (2 * GB) // voxel_bytes

    brick_wall = BrickWall.from_volume_service(
        volume_service, 0, None, self.sc, target_partition_size_voxels)
    brick_wall.persist_and_execute("Downloading segmentation", logger)

    # brick -> (body_id, (box, mask, count))
    body_ids_and_masks = brick_wall.bricks.flatMap(partial(body_masks, config))
    persist_and_execute(body_ids_and_masks, "Computing brick-local masks", logger)
    brick_wall.unpersist()
    del brick_wall

    # In the case of catastrophic merges, some bodies may be too big to handle.
    # Skeletonizing them would probably time out anyway.
    bad_bodies = self.list_unmanageable_bodies(body_ids_and_masks)

    def is_manageable(k_v):
        return k_v[0] not in bad_bodies
    body_ids_and_masks = body_ids_and_masks.filter(is_manageable)

    # (body_id, (box, mask, count))
    #   --> (body_id, [(box, mask, count), (box, mask, count), (box, mask, count), ...])
    grouped_body_ids_and_masks = body_ids_and_masks.groupByKey()
    persist_and_execute(grouped_body_ids_and_masks, "Grouping masks by body id", logger)
    body_ids_and_masks.unpersist()
    del body_ids_and_masks

    # (Same RDD contents, but without small bodies)
    is_large_enough = partial(is_combined_object_large_enough, config)
    grouped_large_body_ids_and_masks = grouped_body_ids_and_masks.filter(is_large_enough)
    persist_and_execute(grouped_large_body_ids_and_masks, "Filtering masks by size", logger)
    grouped_body_ids_and_masks.unpersist()
    del grouped_body_ids_and_masks

    @self.collect_log(lambda _: '_AGGREGATION_ERRORS')
    def logged_combine(arg):
        return combine_masks_in_subprocess(config, arg)

    #  --> (body_id, combined_box, mask, downsample_factor, error_msg)
    id_box_mask_factor_err = grouped_large_body_ids_and_masks.map(logged_combine)
    persist_and_execute(id_box_mask_factor_err, "Downsampling and aggregating masks", logger)
    grouped_large_body_ids_and_masks.unpersist()
    del grouped_large_body_ids_and_masks

    # Errors were already written to a separate file; echo them in the master log, too.
    def error_field(i_b_m_f_e):
        return i_b_m_f_e[-1]
    for message in id_box_mask_factor_err.map(error_field).filter(bool).collect():
        logger.error(message)

    # Small bodies (or those with errors) were not processed,
    # and 'None' was returned instead of a mask. Remove them.
    def mask_is_not_none(i_b_m_f_e):
        _body, _box, combined_mask, _factor, _err = i_b_m_f_e
        return combined_mask is not None

    large_id_box_mask_factor_err = id_box_mask_factor_err.filter(mask_is_not_none)

    output_types = config["options"]["output-types"]
    if "neutube-skeleton" in output_types:
        self._execute_skeletonization(large_id_box_mask_factor_err)

    if "mesh" in output_types:
        self._execute_mesh_generation(large_id_box_mask_factor_err)
def execute(self):
    """
    Check DVID's coarse sparsevol (block) index against the input segmentation.

    For every non-zero body in the input volume, the set of 64-voxel blocks the
    body touches is computed from the voxel data and compared with the block
    list DVID reports via its 'sparsevols-coarse' range request.

    The discrepancies are written as JSON to the file named by
    self.config_data["output"].  Each entry is a list [in_volume, body, blocks]:

    - [True, body, blocks]:  blocks found in the volume but missing from DVID
                             (or the body is missing from DVID's response entirely).
    - [False, body, blocks]: blocks listed by DVID but not found in the volume.

    Blocks are (z, y, x) block indexes.
    """
    self._sanitize_config()

    # hard coding block size that is 64
    # (TODO: make dynamic)
    BLKSIZE = 64

    input_config = self.config_data["input"]

    input_bricks, _bounding_box, _input_grid = self._partition_input()
    persist_and_execute(input_bricks, "Reading entire volume", logger)

    # find the blocks for each body
    def extractBodyBlockIds(brick):
        """Determine the blocks that intersect each body.

        FlatMap: brick -> (bodyid, [blockids])
        """
        vol = brick.volume
        offsetzyx = brick.physical_box[0]
        zsz, ysz, xsz = vol.shape
        assert zsz == BLKSIZE
        assert ysz == BLKSIZE
        assert xsz % BLKSIZE == 0

        zid = offsetzyx[0] // BLKSIZE
        yid = offsetzyx[1] // BLKSIZE
        xid = offsetzyx[2] // BLKSIZE

        bodymappings = {}
        # Walk the brick one block at a time along X.
        for blockspot in range(0, xsz, BLKSIZE):
            bodyids = np.unique(vol[:, :, blockspot:(blockspot+BLKSIZE)])
            for bodyid in bodyids:
                if bodyid == 0:
                    # ignore background bodies
                    continue
                bodymappings.setdefault(bodyid, []).append((zid, yid, xid))
            xid += 1

        return list(bodymappings.items())

    allbodies = input_bricks.flatMap(extractBodyBlockIds)
    del input_bricks

    # combine body information across RDD
    def combineBodyInfo(part1, part2):
        part1.extend(part2)
        return part1
    allbodies = allbodies.reduceByKey(combineBodyInfo)
    allbodies.persist()

    # get global list
    globalbodylist = allbodies.map(lambda x: x[0]).collect()
    globalbodylist.sort()

    # Rank of each body in the sorted list.
    # (A dict lookup avoids an O(N) list.index() scan for every body.)
    body_rank = {body: rank for rank, body in enumerate(globalbodylist)}

    # group sorted bodies into batches of BODYLIMIT consecutive bodies
    BODYLIMIT = 1000
    def findorder(bodyblocks):
        body, blocks = bodyblocks
        batch = body_rank[body] // BODYLIMIT
        return (batch, [(body, blocks)])
    allbodies_index = allbodies.map(findorder)

    def orderbodies(b1, b2):
        b1.extend(b2)
        return b1
    allbodies_sorted = allbodies_index.reduceByKey(orderbodies)

    # TODO extract indices in separate step to measure fetch time

    # fetch indices for provided block and produce list of [body, bad ids]
    server = input_config["dvid"]["server"]
    uuid = input_config["dvid"]["uuid"]
    resource_server = self.resource_server
    resource_port = self.resource_port
    labelname = input_config["dvid"]["segmentation-name"]
    appname = self.APPNAME

    def parse_coarse_sparsevols(raw):
        """Decode a 'sparsevols-coarse' response into {body: [(z, y, x), ...]}.

        The payload is read as a flat sequence of 32-bit words.  Per body:
        two words for the 64-bit body ID (low word first), one word for the
        number of spans, then four words (x, y, z, run-length along x) per span.
        """
        words = np.frombuffer(raw, dtype=np.int32)
        blocks_by_body = {}
        pos = 0
        while pos < len(words):
            # Assemble the body ID with Python ints, masking each word to unsigned.
            # (Shifting an int32 scalar by 32 bits would discard the high word,
            #  and a low word >= 2**31 would otherwise be sign-extended.)
            low_word = int(words[pos]) & 0xFFFFFFFF
            high_word = int(words[pos+1]) & 0xFFFFFFFF
            currbody = low_word | (high_word << 32)
            numspans = int(words[pos+2])
            pos += 3

            blockarray = []
            for _ in range(numspans):
                dimx, dimy, dimz, runx = (int(w) for w in words[pos:pos+4])
                pos += 4
                # Expand the run along X into individual blocks.
                blockarray.extend((dimz, dimy, xblock) for xblock in range(dimx, dimx+runx))
            blocks_by_body[currbody] = blockarray
        return blocks_by_body

    def findindexerrors(bodies):
        _batch, bodylist = bodies
        bodymappings = dict(bodylist)

        # call block index DVID API
        from libdvid import ConnectionMethod
        b1 = min(bodymappings)
        b2 = max(bodymappings)

        ns = retrieve_node_service(server, uuid, resource_server, resource_port, appname)

        addr = str(labelname + "/sparsevols-coarse/" + str(b1) + "/" + str(b2))
        res = ns.custom_request(addr, None, ConnectionMethod.GET)
        bodymappingsdvid = parse_coarse_sparsevols(res)

        allerrors = []
        # find differences
        for body, blocklist in bodymappings.items():
            if body not in bodymappingsdvid:
                allerrors.append([True, body, blocklist])
                continue

            bset = set(blocklist)
            bsetdvid = set(bodymappingsdvid[body])

            # false negatives
            errors = list(bset - bsetdvid)
            if errors:
                allerrors.append([True, body, errors])

            # false positives
            errors2 = list(bsetdvid - bset)
            if errors2:
                allerrors.append([False, body, errors2])
        return allerrors

    badindices = allbodies_sorted.flatMap(findindexerrors)

    # report errors
    allerrors = badindices.collect()

    # TODO provide link locations for bad bodies
    #self._log_neuroglancer_links()

    # Use a context manager so the report is flushed and closed.
    with open(self.config_data["output"], 'w') as fout:
        fout.write(json.dumps(allerrors, indent=2, cls=NumpyConvertingEncoder))

    logger.info("DONE analyzing segmentation.")