def test_iter_batches():
    data = range(10)

    # If data supports len, we support it, and if not, we don't.
    assert hasattr(iter_batches(data, 3), '__len__')
    assert not hasattr(iter_batches(iter(data), 3), '__len__')

    assert len(iter_batches(data, 3)) == 4

    assert [*iter_batches(data, 3)] == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
    assert [*iter_batches(iter(data), 3)] == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

    data = list(data)
    assert [*iter_batches(data, 3)] == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

    data = np.array(data)
    assert [a.tolist() for a in iter_batches(data, 3)] == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

    data = pd.Series(data)
    assert [a.tolist() for a in iter_batches(data, 3)] == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

    data = pd.DataFrame(data, columns=['a'])
    assert [df['a'].tolist() for df in iter_batches(data, 3)] == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
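
# Hypothetical sketch of an iter_batches() implementation, written only to make
# the contract in test_iter_batches() above concrete -- this is an assumption,
# not the library's actual code. Sized inputs (range/list/ndarray/Series/DataFrame)
# get an object that supports len(); plain iterators get a generator that doesn't.
import itertools

def iter_batches_sketch(data, batch_size):
    try:
        len(data)
    except TypeError:
        return _batches_from_iterator(data, batch_size)
    return _SizedBatches(data, batch_size)

def _batches_from_iterator(it, batch_size):
    # Plain generator: no __len__, matching the iter(data) case in the test.
    while True:
        batch = list(itertools.islice(it, batch_size))
        if not batch:
            return
        yield batch

class _SizedBatches:
    def __init__(self, data, batch_size):
        self.data = data
        self.batch_size = batch_size

    def __len__(self):
        # Number of batches, rounded up: 10 items with batch_size 3 -> 4
        return -(-len(self.data) // self.batch_size)

    def __iter__(self):
        for start in range(0, len(self.data), self.batch_size):
            s = slice(start, start + self.batch_size)
            if isinstance(self.data, (pd.Series, pd.DataFrame)):
                yield self.data.iloc[s]   # positional slicing for pandas
            elif isinstance(self.data, range):
                yield list(self.data[s])  # the test expects plain lists here
            else:
                yield self.data[s]        # list/ndarray slicing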
def append_synapse_columns(self, body_table, neuprint_info):
    server, dataset = neuprint_info["server"], neuprint_info["dataset"]
    if not server or not dataset:
        return body_table

    from neuprint import Client, default_client as neuprint_default_client, fetch_neurons, NeuronCriteria as NC

    @auto_retry(5, pause_between_tries=3.0, logging_name=__name__)
    def fetch_synapse_counts(bodies):
        try:
            c = neuprint_default_client()
        except Exception:
            c = Client(server, dataset)
        ndf, cdf = fetch_neurons(NC(bodyId=bodies, label='Segment'), client=c)
        return ndf.set_index('bodyId')[['pre', 'post']].rename_axis('body')

    bag = db.from_sequence(iter_batches(body_table.index.values, 1000), npartitions=16)
    sc_dfs = bag.map(fetch_synapse_counts).compute()
    sc_df = pd.concat(sc_dfs)

    body_table = body_table.merge(sc_df, 'left', on='body')
    body_table['pre'] = body_table['pre'].fillna(0.0).astype(int)
    body_table['post'] = body_table['post'].fillna(0.0).astype(int)
    return body_table
def edges_to_assignments(df, gray_source, seg_source, sv_as_body=False, batch_size=100,
                         output_path=None, *, shuffle=False, description=""):
    if isinstance(df, str):
        df = pd.read_csv(df)
    assert isinstance(df, pd.DataFrame)

    dupes = df.duplicated(['sv_a', 'sv_b']).sum()
    if dupes:
        print(f"Dropping {dupes} duplicate tasks!")
        df = df.drop_duplicates(['sv_a', 'sv_b'])

    print(f"Writing {len(df)} tasks")
    if shuffle:
        print("Shuffling task order")
        df = df.sample(frac=1)

    # Only create the output directory if a path was given
    # (and if the path actually contains a directory component).
    if output_path and os.path.dirname(output_path):
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

    assignments = []
    for i, batch_df in enumerate(iter_batches(df, batch_size)):
        if output_path:
            base, _ = os.path.splitext(output_path)
            batch_path = f"{base}-{i:03d}.json"
        else:
            batch_path = None
        a = edges_to_assignment(batch_df, gray_source, seg_source, sv_as_body, batch_path,
                                description=description)
        assignments.append(a)
    return assignments
def _execute_scale(self, scale, starting_batch, mask_s5, mask_box_s5):
    options = self.config["masksegmentation"]
    block_width = self.output_service.block_width

    def scale_box(box, scale):
        # Scale down, then round up to the nearest multiple of the block width
        box = np.ceil(box / 2**scale).astype(np.int32)
        return round_box(box, block_width)

    # Bounding box of the segmentation at the current scale.
    bounding_box = scale_box(self.input_service.bounding_box_zyx, scale)

    # Don't make bricks that are wider than the bounding box at this scale
    brick_shape = np.minimum(self.input_service.preferred_message_shape, bounding_box[1])
    assert not (brick_shape % block_width).any()

    brick_boxes = boxes_from_grid(bounding_box, brick_shape, clipped=True)

    with Timer(f"Scale {scale}: Preparing bricks", logger):
        boxes_and_masks = []
        for box in brick_boxes:
            mask_block_box = (box // 2**(5 - scale)) - mask_box_s5[0]
            mask_block_box = mask_block_box.astype(np.int32)  # necessary when scale is > 5
            mask_block_s5 = extract_subvol(mask_s5, mask_block_box)
            if mask_block_s5.any():
                boxes_and_masks.append((box, mask_block_s5))

    batches = [*iter_batches(boxes_and_masks, options["batch-size"])]

    if starting_batch == 0:
        logger.info(f"Scale {scale}: Processing {len(batches)} batches")
    else:
        logger.info(f"Scale {scale}: Processing {len(batches) - starting_batch} "
                    f"remaining batches from {len(batches)} original batches")
        assert starting_batch < len(batches), \
            f"Can't start at batch {starting_batch}; there are only {len(batches)} in total."
        batches = batches[starting_batch:]

    for batch_index, batch_boxes_and_masks in enumerate(batches, start=starting_batch):
        with Timer(f"Scale {scale}: Batch {batch_index:02d}", logger):
            self._execute_batch(scale, batch_index, batch_boxes_and_masks)
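
# Worked example of the scale_box() logic above, self-contained for clarity.
# round_box() is emulated here with plain numpy (assuming, per the comment in
# scale_box(), that it rounds the box outward to block-width multiples);
# block_width=64 is an arbitrary illustrative value.
import numpy as np

def demo_scale_box(box, scale, block_width=64):
    box = np.ceil(np.asarray(box) / 2**scale).astype(np.int32)
    box[0] = (box[0] // block_width) * block_width    # round start down
    box[1] = -(-box[1] // block_width) * block_width  # round stop up
    return box

# demo_scale_box([[0, 0, 0], [1000, 900, 500]], scale=3)
# ceil-scales to [[0, 0, 0], [125, 113, 63]], then rounds out to
# [[0, 0, 0], [128, 128, 64]]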
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--split-into-batches', type=int,
                        help='If given, also split the body stats into this many batches of roughly equal size')
    parser.add_argument('server')
    parser.add_argument('src_uuid')
    parser.add_argument('labelmap_instance')
    parser.add_argument('supervoxel_block_stats_h5',
                        help=f'An HDF5 file with a single dataset "stats", with dtype: {STATS_DTYPE[1:]} '
                             '(Note: No column for body_id)')
    args = parser.parse_args()

    configure_default_logging()
    initialize_excepthook()

    (block_sv_stats, _presorted_by, _agglo_path) = load_stats_h5_to_records(args.supervoxel_block_stats_h5)

    src_info = (args.server, args.src_uuid, args.labelmap_instance)
    mapping = fetch_mappings(*src_info)
    assert isinstance(mapping, pd.Series)

    mapping_df = mapping.reset_index().rename(columns={'sv': 'segment_id', 'body': 'body_id'})

    # sorts in-place, and saves a copy to hdf5
    sort_block_stats(block_sv_stats, mapping_df,
                     args.supervoxel_block_stats_h5[:-3] + '-sorted-by-body.h5',
                     '<fetched-from-dvid>')

    if args.split_into_batches:
        num_batches = args.split_into_batches
        batch_size = int(np.ceil(len(block_sv_stats) / args.split_into_batches))
        logger.info(f"Splitting into {args.split_into_batches} batches of size ~{batch_size}")
        os.makedirs('stats-batches', exist_ok=True)

        body_spans = groupby_spans_presorted(block_sv_stats['body_id'][:, None])
        for batch_index, batch_spans in enumerate(tqdm_proxy(iter_batches(body_spans, batch_size))):
            span_start, span_stop = batch_spans[0][0], batch_spans[-1][1]
            batch_stats = block_sv_stats[span_start:span_stop]
            digits = int(np.ceil(np.log10(num_batches)))
            batch_path = ('stats-batches/stats-batch-{:0' + str(digits) + 'd}.h5').format(batch_index)
            save_stats(batch_stats, batch_path)

    logger.info("DONE sorting stats by body")
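
# Hypothetical sketch of groupby_spans_presorted(), inferred from its usage
# above (batch_spans[0][0] / batch_spans[-1][1] bound a contiguous row range)
# -- an assumption, not the actual implementation. Given a presorted 1-D
# sequence of group ids, it yields one (start, stop) index span per run of
# equal values.
def groupby_spans_presorted_sketch(ids):
    start = 0
    for i in range(1, len(ids) + 1):
        if i == len(ids) or ids[i] != ids[start]:
            yield (start, i)
            start = i

# list(groupby_spans_presorted_sketch([1, 1, 2, 3, 3, 3]))
# -> [(0, 2), (2, 3), (3, 6)]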
def execute(self): options = self.config["maskedcopy"] input_service, mask_service, output_service = self.init_services() def _masked_copy(box): seg_vol = input_service.get_subvolume(box) mask_vol = mask_service.get_subvolume(box).astype(bool) seg_vol[~mask_vol] = 0 output_service.write_subvolume(seg_vol, box[0]) return (*box[0], mask_vol.sum()) # Boxes are determined by the left volume/labels/roi boxes = self.init_boxes(input_service, options["roi"]) batches = iter_batches(boxes, options["batch-size"]) logger.info(f"Performing masked copy of {len(boxes)} bricks in total.") logger.info( f"Processing {len(batches)} batches of {options['batch-size']} bricks each." ) os.makedirs('mask-stats', exist_ok=True) for batch_index, batch_boxes in enumerate(batches): if batch_index < options["restart-at-batch"]: logger.info(f"Batch {batch_index}: Skipping") continue with Timer(f"Batch {batch_index}: Copying", logger): # Aim for 4 partitions per worker total_cores = sum(self.client.ncores().values()) brick_counts = (db.from_sequence( batch_boxes, npartitions=4 * total_cores).map(_masked_copy).compute()) brick_counts_df = pd.DataFrame(brick_counts, columns=[*'zyx', 'mask_voxels']) brick_counts_df.to_csv( f'mask-stats/batch-{batch_index:03d}-brick-mask-voxels.csv', header=True, index=False)
def main():
    RESULTS_PKL_PATH = sys.argv[1]
    if len(sys.argv) == 3:
        PROCESSES = int(sys.argv[2])
    else:
        PROCESSES = 4

    # Calculate the difference in resolution between the stored mito segmentation
    # and neuron segmentation. If they differ, it must be by a power of 2.
    mito_res = fetch_info(*MITO_SEG)["Extended"]["VoxelSize"][0]
    assert mito_res % NEIGHBORHOOD_RES == 0
    assert np.log2(mito_res / NEIGHBORHOOD_RES) == int(np.log2(mito_res / NEIGHBORHOOD_RES)), \
        "This script assumes that the mito resolution and neighborhood resolution differ by a power of 2."
    mito_res_scale_diff = int(np.log2(mito_res // NEIGHBORHOOD_RES))

    with open(RESULTS_PKL_PATH, 'rb') as f:
        mc_df = pickle.load(f)

    new_names = {col: col.replace(' ', '_') for col in mc_df.columns}
    new_names['result'] = 'proofreader_count'
    mc_df = mc_df.rename(columns=new_names)

    print("Evaluating mito count results")
    results = compute_parallel(partial(_task_results, mito_res_scale_diff),
                               iter_batches(mc_df.drop_duplicates('neighborhood_id'), 1),
                               total=len(mc_df), processes=PROCESSES,
                               leave_progress=True, ordered=False)

    cols = ['neighborhood_id', 'neighborhood_origin', 'proofreader_count',
            'mito_id_count', 'mito_ids', 'mito_sizes',
            'num_ccs', 'mito_cc_ids', 'mito_cc_sizes', 'ng_link']
    df = pd.DataFrame(results, columns=cols)

    # Add columns for cell type (from neuprint)
    print("Fetching neuron cell types")
    origins_df = pd.DataFrame(df['neighborhood_origin'].tolist(), columns=[*'xyz'])
    df['body'] = fetch_labels_batched(*NEURON_SEG, origins_df[[*'zyx']].values, processes=8)
    neurons_df, _ = fetch_neurons(df['body'].unique())
    neurons_df = neurons_df.rename(columns={'bodyId': 'body',
                                            'type': 'body_type',
                                            'instance': 'body_instance'})
    df = df.merge(neurons_df[['body', 'body_type', 'body_instance']], 'left', on='body')
    df['body_type'] = df['body_type'].fillna("")
    df['body_instance'] = df['body_instance'].fillna("")

    # Append roi column
    print("Determining ROIs")
    determine_point_rois(*NEURON_SEG[:2], NEUPRINT_CLIENT.primary_rois, origins_df)
    df['roi'] = origins_df['roi']

    # Results only
    path = 'mito-seg-counts.pkl'
    print(f"Writing {path}")
    with open(path, 'wb') as f:
        pickle.dump(df, f)

    path = 'mito-seg-counts.tab-delimited.csv'
    print(f"Writing {path}")
    df.to_csv(path, sep='\t', header=True, index=False)

    # Full results (with task info columns)
    df = df.merge(mc_df.drop(columns=['neighborhood_origin', 'proofreader_count']),
                  'left', on='neighborhood_id')

    path = 'full-results-with-mito-seg-counts.pkl'
    print(f"Writing {path}")
    with open(path, 'wb') as f:
        pickle.dump(df, f)

    path = 'full-results-with-mito-seg-counts.tab-delimited.csv'
    print(f"Writing {path}")
    df.to_csv(path, sep='\t', header=True, index=False)

    print("DONE")
def execute(self):
    self.init_services()
    left_service = self.left_service
    right_service = self.right_service
    options = self.config["contingencytable"]

    left_is_supervoxels = False
    if isinstance(left_service.base_service, DvidVolumeService):
        left_is_supervoxels = left_service.base_service.supervoxels

    left_roi = options["left-roi"]
    left_subset_labels = load_body_list(options["left-subset-labels"], left_is_supervoxels)
    left_subset_labels = set(left_subset_labels)
    sparse_fetch = not options["skip-sparse-fetch"]

    # Boxes are determined by the left volume/labels/roi
    boxes = self.init_boxes(left_service, sparse_fetch and left_subset_labels, left_roi)

    def _contingency_table(box):
        left_vol = left_service.get_subvolume(box)
        right_vol = right_service.get_subvolume(box)
        table = contingency_table(left_vol, right_vol)
        return table.reset_index()

    batch_tables = []
    batches = iter_batches(boxes, options["batch-size"])
    logger.info(f"Computing contingency tables for {len(boxes)} bricks in total.")
    logger.info(f"Processing {len(batches)} batches of {options['batch-size']} bricks each.")

    for batch_index, batch_boxes in enumerate(batches):
        with Timer(f"Batch {batch_index}: Computing tables", logger):
            # Aim for 4 partitions per worker
            total_cores = sum(self.client.ncores().values())
            tables = (db.from_sequence(batch_boxes, npartitions=4 * total_cores)
                        .map(_contingency_table)
                        .compute())

        table = pd.concat(tables, ignore_index=True).sort_values(['left', 'right']).reset_index(drop=True)
        table = table.groupby(['left', 'right'], as_index=False, sort=False)['voxel_count'].sum()
        batch_tables.append(table)

    with Timer("Constructing final table", logger):
        final_table = pd.concat(batch_tables, ignore_index=True).sort_values(['left', 'right']).reset_index(drop=True)
        final_table = final_table.groupby(['left', 'right'], as_index=False, sort=False)['voxel_count'].sum()

    with Timer("Writing contingency_table.npy", logger):
        np.save('contingency_table.npy', final_table.to_records(index=False))
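
# contingency_table() itself isn't shown here. Below is a minimal sketch of
# what it appears to compute, judging from its usage above (an assumption,
# not the library's implementation): the voxel overlap count for every
# (left, right) label pair, indexed by ['left', 'right'] with a
# 'voxel_count' value, so .reset_index() yields the three columns used above.
import pandas as pd

def contingency_table_sketch(left_vol, right_vol):
    assert left_vol.shape == right_vol.shape
    df = pd.DataFrame({'left': left_vol.reshape(-1),
                       'right': right_vol.reshape(-1)})
    # size() counts how many voxels each (left, right) label pair shares.
    return df.groupby(['left', 'right']).size().rename('voxel_count')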
def execute(self): options = self.config["mitodistances"] output_dir = self.config["output-directory"] body_svc, mito_svc = self.init_services() # Resource manager context must be initialized before resource manager client # (to overwrite config values as needed) dvid_mgr_config = self.config["dvid-access-manager"] dvid_mgr_context = LocalResourceManager(dvid_mgr_config) dvid_mgr_client = ResourceManagerClient(dvid_mgr_config["server"], dvid_mgr_config["port"]) syn_server, syn_uuid, syn_instance = (options['synapse-criteria'][k] for k in ('server', 'uuid', 'instance')) syn_conf = float(options['synapse-criteria']['confidence']) syn_types = ['PreSyn', 'PostSyn'] if options['synapse-criteria']['type'] == 'pre': syn_types = ['PreSyn'] elif options['synapse-criteria']['type'] == 'post': syn_types = ['PostSyn'] bodies = load_body_list(options["bodies"], False) skip_flags = [ os.path.exists(f'{output_dir}/{body}.csv') for body in bodies ] bodies_df = pd.DataFrame({'body': bodies, 'should_skip': skip_flags}) bodies = bodies_df.query('not should_skip')['body'] # Shuffle for better load balance? # TODO: Would be better to sort by synapse count, and put large bodies first, # assigned to partitions in round-robin style. # Then work stealing will be more effective at knocking out the smaller jobs at the end. # This requires knowing all the body sizes, though. # Perhaps mito count would be a decent proxy for synapse count, and it's readily available. #bodies = bodies.sample(frac=1.0).values os.makedirs('body-logs') os.makedirs(output_dir, exist_ok=True) mito_server, mito_uuid, mito_instance = (options['mito-labelmap'][k] for k in ('server', 'uuid', 'instance')) @auto_retry(3) def _fetch_synapses(body): with dvid_mgr_client.access_context(syn_server, True, 1, 1): syn_df = fetch_annotation_label(syn_server, syn_uuid, syn_instance, body, format='pandas') if len(syn_df) == 0: return syn_df syn_types, syn_conf syn_df = syn_df.query( 'kind in @syn_types and conf >= @syn_conf').copy() return syn_df[[*'xyz', 'kind', 'conf' ]].sort_values([*'xyz']).reset_index(drop=True) @auto_retry(3) def _fetch_mito_ids(body): with dvid_mgr_client.access_context(mito_server, True, 1, 1): try: return fetch_supervoxels(mito_server, mito_uuid, mito_instance, body) except HTTPError: return [] def process_and_save(body): tbars = _fetch_synapses(body) valid_mitos = _fetch_mito_ids(body) # TODO: # Does the stdout_redirected() mechanism work correctly in the context of multiprocessing? # If not, I should probably just use a custom logging handler instead. 
with open(f"body-logs/{body}.log", "w") as f, stdout_redirected(f), Timer() as timer: processed_tbars = [] if len(tbars) == 0: logging.getLogger(__name__).warning( f"Body {body}: No synapses found") if len(valid_mitos) == 0: logging.getLogger(__name__).warning( f"Body {body}: Failed to fetch mito supervoxels") processed_tbars = initialize_results(body, tbars) if len(valid_mitos) and len(tbars): processed_tbars = measure_tbar_mito_distances( body_svc, mito_svc, body, tbars=tbars, valid_mitos=valid_mitos) if len(processed_tbars) > 0: processed_tbars.to_csv(f'{output_dir}/{body}.csv', header=True, index=False) with open(f'{output_dir}/{body}.pkl', 'wb') as f: pickle.dump(processed_tbars, f) if len(tbars) == 0: return (body, 0, 'no-synapses', timer.seconds) if len(valid_mitos) == 0: return (body, len(processed_tbars), 'no-mitos', timer.seconds) return (body, len(tbars), 'success', timer.seconds) logger.info( f"Processing {len(bodies)}, skipping {bodies_df['should_skip'].sum()}" ) def process_batch(bodies): return [*map(process_and_save, bodies)] with dvid_mgr_context: batch_size = max(1, len(bodies) // 10_000) futures = self.client.map(process_batch, iter_batches(bodies, batch_size)) # Support synchronous testing with a fake 'as_completed' object if hasattr(self.client, 'DEBUG'): ac = as_completed_synchronous(futures, with_results=True) else: ac = distributed.as_completed(futures, with_results=True) try: results = [] for f, r in tqdm_proxy(ac, total=len(futures)): results.extend(r) finally: results = pd.DataFrame( results, columns=['body', 'synapses', 'status', 'processing_time']) results.to_csv('results-summary.csv', header=True, index=False) num_errors = len(results.query('status == "error"')) if num_errors: logger.warning( f"Encountered {num_errors} errors. See results-summary.csv" )
def execute(self):
    scale = self._init_service()
    options = self.config["roistats"]
    server = self.input_service.base_service.server
    uuid = self.input_service.base_service.uuid
    rois = options["rois"]

    bodies = load_body_list(options["subset-bodies"], self.input_service.base_service.supervoxels)
    assert len(bodies) > 0, "Please provide a list of subset-bodies to process"

    bounding_box = self.input_service.bounding_box_zyx
    assert not (bounding_box % 2**(5 - scale)).any(), \
        "Make sure your configured bounding box is divisible by 32px at scale 0"

    brick_shape = self.input_service.preferred_message_shape
    assert not (brick_shape % 2**(5 - scale)).any(), \
        "Make sure your preferred message shape divides into 32px blocks at scale 0"

    with Timer("Fetching ROI volume", logger):
        roi_vol_s5, roi_box_s5, overlaps = fetch_combined_roi_volume(
            server, uuid, rois, False, bounding_box // 2**(5 - scale))

    if len(overlaps) > 0:
        logger.warning(f"Some of your ROIs overlap! Here's an incomplete list:\n{overlaps}")

    with Timer("Determining brick set", logger):
        brick_coords_df = self.input_service.sparse_brick_coords_for_labels(bodies)
        np.save('brick-coords.npy', brick_coords_df.to_records(index=False))

    with Timer("Preparing bricks", logger):
        boxes_and_roi_bricks = []
        for coord, labels in brick_coords_df.groupby([*'zyx'])['label'].agg(tuple).items():
            box = np.array((coord, coord))
            box[1] += brick_shape
            box = box_intersection(box, bounding_box)

            roi_brick_box = (box // 2**(5 - scale)) - roi_box_s5[0]
            roi_brick_s5 = extract_subvol(roi_vol_s5, roi_brick_box)
            boxes_and_roi_bricks.append((box, roi_brick_s5, labels))

    logger.info(f"Prepared {len(boxes_and_roi_bricks)} bricks of shape {(*brick_shape[::-1],)}")

    all_stats = []
    batches = [*iter_batches(boxes_and_roi_bricks, options["batch-size"])]
    logger.info(f"Processing {len(batches)} batches")
    for i, batch_boxes_and_bricks in enumerate(batches):
        with Timer(f"Batch {i:02d}", logger):
            batch_stats = self._execute_batch(scale, batch_boxes_and_bricks)
            all_stats.append(batch_stats)

    all_stats = pd.concat(all_stats, ignore_index=True)
    all_stats = all_stats.groupby(['body', 'roi_id'], as_index=False)['voxels'].sum()

    roi_names = pd.Series(["<none>", *rois], name='roi')
    roi_names.index.name = 'roi_id'
    all_stats = all_stats.merge(roi_names, 'left', on='roi_id')
    all_stats = all_stats.sort_values(['body', 'roi_id'])

    if scale > 0:
        all_stats.rename(columns={'voxels': f'voxels_s{scale}'}, inplace=True)

    with Timer(f"Writing stats ({len(all_stats)} rows)", logger):
        np.save('roi-stats.npy', all_stats.to_records(index=False))
        all_stats.to_csv('roi-stats.csv', index=False, header=True)
def execute(self):
    self.init_services()
    primary_service = self.primary_service
    contingency_service = self.contingency_service
    output_service = self.output_service
    options = self.config["contingentrelabel"]

    primary_is_supervoxels = False
    if isinstance(primary_service.base_service, DvidVolumeService):
        primary_is_supervoxels = primary_service.base_service.supervoxels

    roi = options["roi"]
    subset_labels = load_body_list(options["subset-labels"], primary_is_supervoxels)
    subset_labels = set(subset_labels)
    sparse_fetch = not options["skip-sparse-fetch"]

    # Boxes are determined by the primary volume/labels/roi
    boxes = self.init_boxes(primary_service, sparse_fetch and subset_labels, roi)
    batches = iter_batches(boxes, options["batch-size"])
    logger.info(f"Relabeling {len(boxes)} bricks in total.")
    logger.info(f"Processing {len(batches)} batches of {options['batch-size']} bricks each.")

    def _contingent_relabel(box):
        primary_vol = primary_service.get_subvolume(box)
        primary_vol = np.ascontiguousarray(primary_vol)

        contingency_vol = contingency_service.get_subvolume(box)
        contingency_vol = np.ascontiguousarray(contingency_vol)

        # Get the set of labels in this box, so we can discard irrelevant portions of the mapping.
        _primary_labels = pd.unique(primary_vol.reshape(-1))  # noqa
        _contingency_labels = pd.unique(contingency_vol.reshape(-1))  # noqa

        cm_path = options["contingent-mapping"]
        if cm_path.endswith('.npy'):
            _cm = np.load(cm_path)
        elif cm_path.endswith('.pkl'):
            with open(cm_path, 'rb') as f:
                _cm = pickle.load(f)
        else:
            raise RuntimeError(f"Don't know how to open mapping file: {cm_path}")

        cm_df = pd.DataFrame(_cm)
        assert {*cm_df.columns} == {'primary', 'contingency', 'final'}

        # Keep only the parts of the mapping we need for this box,
        # just for the sake of performance in the merge below.
        cm_df = cm_df.query('primary in @_primary_labels and contingency in @_contingency_labels').copy()
        cm_df['primary'] = cm_df['primary'].astype(primary_vol.dtype)
        cm_df['contingency'] = cm_df['contingency'].astype(contingency_vol.dtype)

        # Use a merge to essentially map from (primary, contingency) -> final
        input_df = pd.DataFrame({'primary': primary_vol.reshape(-1),
                                 'contingency': contingency_vol.reshape(-1)})
        input_df = input_df.merge(cm_df, 'left', on=['primary', 'contingency'])
        input_df['final'] = input_df['final'].fillna(input_df['primary'])
        input_df['final'] = input_df['final'].astype(primary_vol.dtype)

        final_vol = input_df['final'].values.reshape(primary_vol.shape)
        del input_df
        output_service.write_subvolume(final_vol, box[0])

    for batch_index, batch_boxes in enumerate(batches):
        with Timer(f"Batch {batch_index}: Relabeling", logger):
            # Aim for 4 partitions per worker
            total_cores = sum(self.client.ncores().values())
            (db.from_sequence(batch_boxes, npartitions=4 * total_cores)
               .map(_contingent_relabel)
               .compute())
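
# Tiny self-contained illustration of the merge-as-mapping trick used in
# _contingent_relabel() above (toy labels, not real data): pairs found in the
# mapping take their 'final' value, and unmatched (primary, contingency)
# pairs fall back to the primary label via fillna().
import numpy as np
import pandas as pd

cm_df = pd.DataFrame({'primary':     [1, 1, 2],
                      'contingency': [7, 8, 7],
                      'final':       [10, 11, 12]})

input_df = pd.DataFrame({'primary':     [1, 1, 2, 3],
                         'contingency': [7, 8, 9, 7]})

mapped = input_df.merge(cm_df, 'left', on=['primary', 'contingency'])
mapped['final'] = mapped['final'].fillna(mapped['primary']).astype(np.int64)
print(mapped['final'].tolist())  # [10, 11, 2, 3]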
def execute(self):
    self.init_services()
    left_service = self.left_service
    right_service = self.right_service
    options = self.config["contingencytable"]

    left_is_supervoxels = False
    if isinstance(left_service.base_service, DvidVolumeService):
        left_is_supervoxels = left_service.base_service.supervoxels

    left_roi = options["left-roi"]
    left_subset_labels = load_body_list(options["left-subset-labels"], left_is_supervoxels)
    sparse_fetch = not options["skip-sparse-fetch"]
    min_overlap = options["min-overlap-size"]

    # Boxes are determined by the left volume/labels/roi
    boxes = self.init_boxes(left_service, sparse_fetch and set(left_subset_labels), left_roi)

    def _contingency_table(box):
        left_vol = left_service.get_subvolume(box)
        right_vol = right_service.get_subvolume(box)

        table = contingency_table(left_vol, right_vol)
        table = table.sort_index().reset_index()

        # Compute sizes before filtering
        left_sizes = table.groupby('left')['voxel_count'].sum()
        right_sizes = table.groupby('right')['voxel_count'].sum()

        if len(left_subset_labels) > 0:
            # We keep rows if they match either of these criteria:
            #   1. they touch a left-subset label
            #   2. they touch a left label that intersects with one
            #      of the right labels from criteria 1.
            keep_left = left_sizes.index.intersection(left_subset_labels)  # noqa
            keep_right = table.query('left in @keep_left')['right'].unique()  # noqa
            table = table.query('left in @keep_left or right in @keep_right')

        if min_overlap > 1:
            table = table.query('voxel_count >= @min_overlap')

        left_sizes = left_sizes.loc[table['left'].unique()].reset_index()
        right_sizes = right_sizes.loc[table['right'].unique()].reset_index()
        return table, left_sizes, right_sizes

    batch_tables = []
    batch_left_sizes = []
    batch_right_sizes = []
    batches = iter_batches(boxes, options["batch-size"])
    logger.info(f"Computing contingency tables for {len(boxes)} bricks in total.")
    logger.info(f"Processing {len(batches)} batches of {options['batch-size']} bricks each.")

    for batch_index, batch_boxes in enumerate(batches):
        with Timer(f"Batch {batch_index}: Computing tables", logger):
            # Aim for 4 partitions per worker
            total_cores = sum(self.client.ncores().values())
            results = (db.from_sequence(batch_boxes, npartitions=4 * total_cores)
                         .map(_contingency_table)
                         .compute())

        tables, left_sizes, right_sizes = zip(*results)

        table = pd.concat(tables, ignore_index=True).sort_values(['left', 'right']).reset_index(drop=True)
        table = table.groupby(['left', 'right'], as_index=False, sort=False)['voxel_count'].sum()

        left_sizes = pd.concat(left_sizes, ignore_index=True).groupby('left')['voxel_count'].sum().reset_index()
        right_sizes = pd.concat(right_sizes, ignore_index=True).groupby('right')['voxel_count'].sum().reset_index()

        batch_tables.append(table)
        batch_left_sizes.append(left_sizes)
        batch_right_sizes.append(right_sizes)

    with Timer("Constructing final tables", logger):
        final_table = pd.concat(batch_tables, ignore_index=True).sort_values(['left', 'right']).reset_index(drop=True)
        final_table = final_table.groupby(['left', 'right'], as_index=False, sort=False)['voxel_count'].sum()

        final_left_sizes = pd.concat(batch_left_sizes, ignore_index=True).groupby('left')['voxel_count'].sum()
        final_right_sizes = pd.concat(batch_right_sizes, ignore_index=True).groupby('right')['voxel_count'].sum()

    def dump_table(table, p):
        with Timer(f"Writing {p}", logger), open(p, 'wb') as f:
            pickle.dump(table, f)

    dump_table(final_table, 'contingency_table.pkl')
    dump_table(final_left_sizes, 'left_sizes.pkl')
    dump_table(final_right_sizes, 'right_sizes.pkl')
def execute(self):
    self._init_service()
    options = self.config["roistats"]

    if not options["roi-server"]:
        assert isinstance(self.input_service, DvidVolumeService)
        options["roi-server"] = self.input_service.base_service.server

    if not options["roi-uuid"]:
        assert isinstance(self.input_service, DvidVolumeService)
        options["roi-uuid"] = self.input_service.base_service.uuid

    options["roi-uuid"] = resolve_ref(options["roi-server"], options["roi-uuid"])

    is_supervoxels = (isinstance(self.input_service, DvidVolumeService)
                      and self.input_service.base_service.supervoxels)  # noqa
    bodies = load_body_list(options["subset-bodies"], is_supervoxels)
    assert len(bodies) > 0, "Please provide a list of subset-bodies to process"

    scale = options["analysis-scale"]
    bounding_box = self.input_service.bounding_box_zyx
    assert not (bounding_box % 2**5).any(), \
        "Make sure your configured bounding box is divisible by 32px at scale 0."

    brick_shape = self.input_service.preferred_message_shape
    assert not (brick_shape % 2**5).any(), \
        "Make sure your preferred message shape divides into 32px blocks at scale 0"

    with Timer("Fetching ROI volume", logger):
        roi_vol_s5, roi_box_s5, overlaps = fetch_combined_roi_volume(
            options["roi-server"], options["roi-uuid"], options["rois"], False, bounding_box // 2**5)

    if len(overlaps) > 0:
        logger.warning(f"Some of your ROIs overlap! Here's an incomplete list:\n{overlaps}")

    with Timer("Determining brick set", logger):
        # Determine which bricks intersect our ROIs
        roi_brick_shape = self.input_service.preferred_message_shape // 2**5
        roi_brick_boxes = boxes_from_mask((roi_vol_s5 != 0), roi_box_s5[0], roi_brick_shape, clipped=False)
        roi_brick_boxes *= 2**5
        roi_brick_boxes = box_intersection(roi_brick_boxes, self.input_service.bounding_box_zyx)

        # Non-intersecting boxes have negative shape -- drop them.
        roi_brick_boxes = roi_brick_boxes[((roi_brick_boxes[:, 1, :] - roi_brick_boxes[:, 0, :]) > 0).all(axis=1)]
        roi_brick_coords_df = pd.DataFrame(roi_brick_boxes[:, 0, :], columns=[*'zyx'])

        try:
            body_brick_coords_df = self.input_service.sparse_brick_coords_for_labels(bodies)
        except NotImplementedError:
            # Use all bricks in the ROIs, and use the special label -1 to
            # indicate that all bodies in the list might be found there.
            # (See below.)
            brick_coords_df = roi_brick_coords_df
            brick_coords_df['label'] = -1
        else:
            brick_coords_df = body_brick_coords_df.merge(roi_brick_coords_df, 'inner', on=[*'zyx'])

        assert brick_coords_df.columns.tolist() == [*'zyx', 'label']
        np.save('brick-coords.npy', brick_coords_df.to_records(index=False))

    with Timer("Preparing bricks", logger):
        boxes_and_roi_bricks = []
        for coord, brick_labels in brick_coords_df.groupby([*'zyx'])['label'].agg(tuple).items():
            if brick_labels == (-1,):
                # No sparse body brick locations were found above.
                # Search for all bodies in all bricks.
                brick_labels = bodies

            box = np.array((coord, coord))
            box[1] += brick_shape
            box = box_intersection(box, bounding_box)

            roi_brick_box = (box // 2**5) - roi_box_s5[0]
            roi_brick_s5 = extract_subvol(roi_vol_s5, roi_brick_box)
            boxes_and_roi_bricks.append((box, roi_brick_s5, brick_labels))

    scaled_shape = brick_shape // (2**scale)
    logger.info(f"Prepared {len(boxes_and_roi_bricks)} bricks of scale-0 shape "
                f"{(*brick_shape[::-1],)} ({(*scaled_shape[::-1],)} at scale-{scale})")

    all_stats = []
    batches = [*iter_batches(boxes_and_roi_bricks, options["batch-size"])]
    logger.info(f"Processing {len(batches)} batches")
    for i, batch_boxes_and_bricks in enumerate(batches):
        with Timer(f"Batch {i:02d}", logger):
            batch_stats = self._execute_batch(scale, batch_boxes_and_bricks)
            all_stats.append(batch_stats)

    all_stats = pd.concat(all_stats, ignore_index=True)
    all_stats = all_stats.groupby(['body', 'roi_id'], as_index=False)['voxels'].sum()

    roi_names = pd.Series(["<none>", *options["rois"]], name='roi')
    roi_names.index.name = 'roi_id'
    all_stats = all_stats.merge(roi_names, 'left', on='roi_id')
    all_stats = all_stats.sort_values(['body', 'roi_id'])

    if scale > 0:
        all_stats.rename(columns={'voxels': f'voxels_s{scale}'}, inplace=True)

    with Timer(f"Writing stats ({len(all_stats)} rows)", logger):
        np.save('roi-stats.npy', all_stats.to_records(index=False))
        all_stats.to_csv('roi-stats.csv', index=False, header=True)