def test_post_hierarchical_cleaves(labelmap_setup):
    dvid_server, dvid_repo, _merge_table_path, _mapping_path, _supervoxel_vol = labelmap_setup
    uuid = post_branch(dvid_server, dvid_repo, 'segmentation-post_hierarchical_cleaves', '')
    instance_info = dvid_server, uuid, 'segmentation-post_hierarchical_cleaves'
    create_labelmap_instance(*instance_info)

    svs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    groups = [1, 1, 2, 2, 3, 3, 3, 3, 3, 4]
    svs = np.asarray(svs, np.uint64)

    # Post some supervoxels in multiple blocks, just to prove that post_hierarchical_cleaves()
    # doesn't assume that the labelindex has the same length as the mapping.
    sv_vol = np.zeros((128, 64, 64), np.uint64)
    sv_vol[0, 0, :len(svs)] = svs
    sv_vol[64, 0, 0:len(svs):2] = svs[::2]
    post_labelmap_voxels(*instance_info, (0, 0, 0), sv_vol)

    post_merge(*instance_info, 1, svs[1:])

    group_mapping = pd.Series(index=svs, data=groups)
    final_table = post_hierarchical_cleaves(*instance_info, 1, group_mapping)

    assert (fetch_mapping(*instance_info, svs) == final_table['body'].values).all()
    assert (final_table.drop_duplicates(['group']) == final_table.drop_duplicates(['group', 'body'])).all().all()
    assert (final_table.drop_duplicates(['body']) == final_table.drop_duplicates(['group', 'body'])).all().all()

    # Since the mapping included all supervoxels in the body,
    # the last group is left with the original label.
    assert final_table.iloc[-1]['body'] == 1

    # Now merge them all together and try again, but leave
    # two supervoxels out of the groups this time.
    merges = set(pd.unique(final_table['body'].values)) - set([1])
    post_merge(*instance_info, 1, list(merges))

    group_mapping = pd.Series(index=svs[:-2], data=groups[:-2])
    final_table = post_hierarchical_cleaves(*instance_info, 1, group_mapping)

    assert len(final_table.query('body == 1')) == 0, \
        "Did not expect any of the groups to retain the original body ID!"
    assert (fetch_mapping(*instance_info, svs[:-2]) == final_table['body'].values).all()
    assert (final_table.drop_duplicates(['group']) == final_table.drop_duplicates(['group', 'body'])).all().all()
    assert (final_table.drop_duplicates(['body']) == final_table.drop_duplicates(['group', 'body'])).all().all()
    assert (fetch_mapping(*instance_info, svs[-2:]) == 1).all()
def cleave_supervoxels_as_isolated_bodies(instance_info, sv_ids):
    """
    Separate the given supervoxels from their enclosing bodies by
    cleaving them out as their own independent, single-supervoxel bodies.
    """
    logger.info("Fetching mapping for each SV")
    body_ids = fetch_mapping(*instance_info, sv_ids, as_series=True)

    logger.info("Performing cleaves")
    cleaved_ids = []
    for sv_id, body_id in tqdm(list(zip(sv_ids, body_ids))):
        try:
            cleaved_body = post_cleave(*instance_info, body_id, [sv_id])
        except requests.RequestException as ex:
            if 'cannot cleave all supervoxels from the label' in ex.response.content.decode():
                # Body has only one supervoxel to begin with
                cleaved_body = body_id
            else:
                sys.stderr.write(ex.response.content.decode())
                raise
        cleaved_ids.append(cleaved_body)

    return list(zip(sv_ids, body_ids, cleaved_ids))
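# Hedged usage sketch (added for illustration, not part of the original source):
# the server, uuid, instance name, and supervoxel IDs below are hypothetical
# placeholders.  It simply shows how the return value above can be unpacked.
def _example_cleave_usage():
    instance_info = ('emdata4:8900', 'abc123', 'segmentation')
    results = cleave_supervoxels_as_isolated_bodies(instance_info, [1001, 1002, 1003])
    for sv_id, old_body, new_body in results:
        print(f"sv {sv_id}: cleaved out of body {old_body} into new body {new_body}")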
def main():
    # Hard-coded parameters
    prod = 'emdata4:8900'
    master = (prod, find_master(prod))
    master_seg = (*master, 'segmentation')

    # I accidentally corrupted the labelindex of bodies in this region
    patch_box = 20480 + np.array([[0, 0, 0], [1024, 1024, 1024]])

    with Timer("Fetching supervoxels", logger):
        boxes = boxes_from_grid(patch_box, Grid((64, 64, 6400)), clipped=True)
        sv_sets = compute_parallel(partial(_fetch_svs, master_seg), boxes,
                                   processes=32, ordered=False, leave_progress=True)
        svs = set(chain(*sv_sets)) - set([0])

    bodies = set(fetch_mapping(*master_seg, svs))

    with Timer(f"Repairing {len(bodies)} labelindexes", logger):
        compute_parallel(partial(_repair_index, master_seg), bodies,
                         processes=32, ordered=False, leave_progress=True)

    print("DONE.")
def extract_body_ids_and_launch(c, args, seg_instance, body_csv, msgs_df):
    """
    Extract the list of body IDs from the given kafka messages,
    overwrite the body list CSV file in the workflow template directory,
    and submit a cluster job to launch the workflow.
    """
    # Late imports so --help works quickly
    import numpy as np
    import pandas as pd
    from neuclease.dvid import resolve_ref, fetch_mapping, compute_affected_bodies

    if len(msgs_df) == 0:
        return False

    exclude_bodies = load_bad_bodies()

    # If the uuid was specified as a branch,
    # resolve it to a specific uuid now.
    server, uuid, instance = seg_instance
    uuid = resolve_ref(server, uuid)

    # Extract all bodies and supervoxels that have been touched in the kafka log
    new_bodies, changed_bodies, _removed_bodies, new_supervoxels = compute_affected_bodies(msgs_df['msg'])

    # For touched supervoxels, we need to find their mapped bodies.
    sv_split_bodies = set(fetch_mapping(server, uuid, instance, new_supervoxels)) - set([0])

    subset_bodies = set(chain(new_bodies, changed_bodies, sv_split_bodies))
    subset_bodies -= set(exclude_bodies)
    subset_bodies = np.fromiter(subset_bodies, np.uint64)
    subset_bodies = np.sort(subset_bodies).tolist()

    if len(subset_bodies) == 0:
        return False

    # Overwrite the CSV file for the workflow's subset-bodies set.
    pd.Series(subset_bodies, name='body').to_csv(f'{args.template_dir}/{body_csv}',
                                                 header=True, index=False)

    first_timestamp = msgs_df['timestamp'].iloc[0]
    last_timestamp = msgs_df['timestamp'].iloc[-1]

    logger.info(f"Launching mesh computation for {len(subset_bodies)} bodies, "
                f"modified between [{first_timestamp}] and [{last_timestamp}]")

    # FIXME: Instead of hard-coding -W to one hour, read the template dask-config.yaml
    cmd = (f"source $({args.conda_path} info --base)/bin/activate {args.conda_env} "
           f"&& cd {args.cwd} "
           f"&& bsub -W 01:00 -n {args.driver_slots} -o /dev/null launchflow -n {args.worker_slots} {args.template_dir}")

    run_cmd(c, cmd)
    return True
def adjust_focused_points(server, uuid, instance, assignment_json_data,
                          supervoxels=True, max_search_scale=3):
    new_assignment_data = copy.deepcopy(assignment_json_data)
    new_tasks = new_assignment_data["task list"]

    for task in tqdm_proxy(new_tasks):
        sv_1 = task["supervoxel ID 1"]
        sv_2 = task["supervoxel ID 2"]

        coord_1 = np.array(task["supervoxel point 1"])
        coord_2 = np.array(task["supervoxel point 2"])

        if supervoxels:
            label_1 = sv_1
            label_2 = sv_2
        else:
            label_1, label_2 = fetch_mapping(server, uuid, instance, [sv_1, sv_2], as_series=True)

        avg_coord = (coord_1 + coord_2) // 2

        # Search until we find a scale in which the two touch, or give up.
        for scale in range(1 + max_search_scale):
            box_xyz = (avg_coord // (2**scale) - 64, avg_coord // (2**scale) + 64)
            box_zyx = np.array(box_xyz)[:, ::-1]
            seg_vol = fetch_labelarray_voxels(server, uuid, instance, box_zyx, scale,
                                              supervoxels=supervoxels)

            adjusted_coords_zyx = find_best_plane(seg_vol, label_1, label_2)
            adjusted_coords_zyx = np.array(adjusted_coords_zyx)

            if not (adjusted_coords_zyx == -1).all():
                # Found it.
                adjusted_coords_zyx += box_zyx[0]
                adjusted_coords_zyx *= (2**scale)
                break

        if (adjusted_coords_zyx == -1).all():
            task["coordinate-status"] = "misplaced"
        else:
            task["supervoxel point 1"] = adjusted_coords_zyx[0, ::-1].tolist()
            task["supervoxel point 2"] = adjusted_coords_zyx[1, ::-1].tolist()
            task["coordinate-status"] = f"adjusted-at-scale-{scale}"

    return new_assignment_data
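# Hedged usage sketch (added for illustration, not part of the original source):
# the file paths, server, uuid, and instance names below are hypothetical
# placeholders.  It shows round-tripping an assignment file through the function above.
def _example_adjust_focused_points():
    import json
    with open('/tmp/focused-assignment.json', 'r') as f:
        assignment = json.load(f)

    adjusted = adjust_focused_points('emdata4:8900', 'abc123', 'segmentation',
                                     assignment, supervoxels=True, max_search_scale=3)

    with open('/tmp/focused-assignment-adjusted.json', 'w') as f:
        json.dump(adjusted, f, indent=2)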
def determine_changed_labelmap_bodies(self, kafka_timestamp_string):
    """
    Read the entire labelmap kafka log, and determine
    which bodies have changed since the given timestamp (a string).

    Example timestamps:
        - "2018-11-22"
        - "2018-11-22 17:34:00"

    Returns:
        list of body IDs
    """
    logger.info(f"Determining which bodies have changed since {kafka_timestamp_string}")

    try:
        kafka_timestamp = parse_timestamp(kafka_timestamp_string)
    except:
        raise RuntimeError(f"Could not parse your subset-bodies config setting ({kafka_timestamp_string}) "
                           "as either a body list or a kafka timestamp")

    seg_instance = self.instance_triple

    kafka_msgs = read_kafka_messages(*seg_instance)
    filtered_kafka_msgs = filter_kafka_msgs_by_timerange(kafka_msgs, min_timestamp=kafka_timestamp)

    new_bodies, changed_bodies, _removed_bodies, new_supervoxels = compute_affected_bodies(filtered_kafka_msgs)
    sv_split_bodies = set(fetch_mapping(*seg_instance, new_supervoxels)) - set([0])

    subset_bodies = set(chain(new_bodies, changed_bodies, sv_split_bodies))
    subset_bodies = np.fromiter(subset_bodies, np.uint64)
    subset_bodies = np.sort(subset_bodies).tolist()

    logger.info(f"The kafka log shows that {len(subset_bodies)} bodies have changed since ({kafka_timestamp_string})")
    return subset_bodies
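# Hedged usage sketch (added for illustration, not part of the original source):
# assuming `workflow` is an instance of the enclosing class (not shown here), with
# `instance_triple` pointing at a labelmap instance that has a populated kafka log:
#
#   changed_bodies = workflow.determine_changed_labelmap_bodies("2018-11-22 17:34:00")
#   print(f"{len(changed_bodies)} bodies have changed since the given timestamp")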
def test_masksegmentation_basic(setup_dvid_segmentation_input, invert_mask, roi_dilation, disable_auto_retry):
    template_dir, config, volume, dvid_address, repo_uuid, roi_mask_s5, input_segmentation_name, output_segmentation_name = setup_dvid_segmentation_input

    if invert_mask:
        roi_mask_s5 = ~roi_mask_s5

    config["masksegmentation"]["invert-mask"] = invert_mask
    config["masksegmentation"]["dilate-roi"] = roi_dilation

    # re-dump config
    yaml = YAML()
    yaml.default_flow_style = False
    with open(f"{template_dir}/workflow.yaml", 'w') as f:
        yaml.dump(config, f)

    execution_dir, workflow = launch_flow(template_dir, 1)
    final_config = workflow.config

    input_box_xyz = np.array(final_config['input']['geometry']['bounding-box'])
    input_box_zyx = input_box_xyz[:, ::-1]

    roi_mask = upsample(roi_mask_s5, 2**5)
    roi_mask = extract_subvol(roi_mask, input_box_zyx)

    expected_vol = extract_subvol(volume.copy(), input_box_zyx)
    expected_vol[roi_mask] = 0

    output_box_xyz = np.array(final_config['output']['geometry']['bounding-box'])
    output_box_zyx = output_box_xyz[:, ::-1]
    output_vol = fetch_labelmap_voxels(dvid_address, repo_uuid, output_segmentation_name,
                                       output_box_zyx, scale=0, supervoxels=True)

    # Create a copy of the volume that contains only the voxels we removed
    erased_vol = volume.copy()
    erased_vol[~roi_mask] = 0

    if EXPORT_DEBUG_FILES:
        original_vol = fetch_labelmap_voxels(dvid_address, repo_uuid, input_segmentation_name,
                                             output_box_zyx, scale=0, supervoxels=True)
        original_agglo_vol = fetch_labelmap_voxels(dvid_address, repo_uuid, input_segmentation_name,
                                                   output_box_zyx, scale=0)
        output_agglo_vol = fetch_labelmap_voxels(dvid_address, repo_uuid, output_segmentation_name,
                                                 output_box_zyx, scale=0)
        np.save('/tmp/original-svs.npy', original_vol)
        np.save('/tmp/original-agglo.npy', original_agglo_vol)
        np.save('/tmp/output.npy', output_vol)
        np.save('/tmp/output-agglo.npy', output_agglo_vol)
        np.save('/tmp/expected.npy', expected_vol)
        np.save('/tmp/erased.npy', erased_vol)

        shutil.copyfile(f'{execution_dir}/roi-mask.h5', '/tmp/roi-mask.h5')
        if roi_dilation:
            shutil.copyfile(f'{execution_dir}/dilated-roi-mask.h5', '/tmp/dilated-roi-mask.h5')
        if invert_mask:
            shutil.copyfile(f'{execution_dir}/segmentation-mask.h5', '/tmp/segmentation-mask.h5')
        shutil.copyfile(f'{execution_dir}/final-mask.h5', '/tmp/final-mask.h5')

    if roi_dilation > 0:
        # FIXME: We don't yet verify voxel-accuracy of ROI dilation.
        return

    assert (output_vol == expected_vol).all(), \
        "Written vol does not match expected"

    scaled_expected_vol = expected_vol
    for scale in range(1, 1 + MAX_SCALE):
        scaled_expected_vol = downsample(scaled_expected_vol, 2, 'labels-numba')
        scaled_output_vol = fetch_labelmap_voxels(dvid_address, repo_uuid, output_segmentation_name,
                                                  output_box_zyx // 2**scale, scale=scale, supervoxels=True)

        if EXPORT_DEBUG_FILES:
            np.save(f'/tmp/expected-{scale}.npy', scaled_expected_vol)
            np.save(f'/tmp/output-{scale}.npy', scaled_output_vol)

        if scale <= 5:
            assert (scaled_output_vol == scaled_expected_vol).all(), \
                f"Written vol does not match expected at scale {scale}"
        else:
            # For scale 6 and 7, some blocks are not even changed,
            # but that means we would be comparing DVID's label
            # downsampling method to our method ('labels-numba').
            # The two don't necessarily give identical results in the case of 'ties',
            # so we'll just verify that the nonzero voxels match, at least.
            assert ((scaled_output_vol == 0) == (scaled_expected_vol == 0)).all(), \
                f"Written vol does not match expected at scale {scale}"

    block_stats_path = f'{execution_dir}/erased-block-statistics.h5'
    with h5py.File(block_stats_path, 'r') as f:
        stats_df = pd.DataFrame(f['stats'][:])

    #
    # Check the exported block statistics
    #
    stats_cols = [*BLOCK_STATS_DTYPES.keys()]
    assert stats_df.columns.tolist() == stats_cols
    stats_df = stats_df.sort_values(stats_cols).reset_index()

    expected_stats_df = block_stats_for_volume((64, 64, 64), erased_vol, input_box_zyx)
    expected_stats_df = expected_stats_df.sort_values(stats_cols).reset_index()

    assert len(stats_df) == len(expected_stats_df)
    assert (stats_df == expected_stats_df).all().all()

    #
    # Try updating the labelindexes
    #
    src_info = (dvid_address, repo_uuid, input_segmentation_name)
    dest_info = (dvid_address, repo_uuid, output_segmentation_name)
    with switch_cwd(execution_dir):
        erase_from_labelindexes(src_info, dest_info, block_stats_path,
                                batch_size=10, threads=4)

    # Verify deleted supervoxels
    assert os.path.exists(f'{execution_dir}/deleted-supervoxels.csv')
    deleted_svs = set(pd.read_csv(f'{execution_dir}/deleted-supervoxels.csv')['sv'])

    orig_svs = {*pd.unique(volume.reshape(-1))} - {0}
    remaining_svs = {*pd.unique(expected_vol.reshape(-1))} - {0}
    expected_deleted_svs = orig_svs - remaining_svs
    assert deleted_svs == expected_deleted_svs

    # Verify remaining sizes
    expected_sv_counts = (pd.Series(expected_vol.reshape(-1), name='sv')
                            .value_counts()
                            .drop(0)
                            .sort_index()
                            .rename('count'))

    index_dfs = []
    for body in np.unique(fetch_mapping(*dest_info, remaining_svs)):
        index_df = fetch_labelindex(*dest_info, body, format='pandas').blocks
        index_dfs.append(index_df)

    sv_counts = (pd.concat(index_dfs, ignore_index=True)[['sv', 'count']]
                   .groupby('sv')['count']
                   .sum()
                   .sort_index())

    assert set(sv_counts.index.values) == set(expected_sv_counts.index.values)
    assert (sv_counts == expected_sv_counts).all(), \
        pd.DataFrame({'stored_count': sv_counts, 'expected_count': expected_sv_counts}).query('stored_count != expected_count')

    # Verify mapping
    # Deleted supervoxels exist in the mapping, but they map to 0.
    assert (fetch_mapping(*dest_info, [*deleted_svs]) == 0).all()

    # Remaining supervoxels still map to their original bodies
    assert (fetch_mapping(*dest_info, [*remaining_svs]) == fetch_mapping(*src_info, [*remaining_svs])).all()
def sparse_brick_coords_for_labels(self, labels, clip=True):
    """
    Return a DataFrame indicating the brick
    coordinates (starting corner) that encompass the given labels.

    Args:
        labels:
            A list of body IDs (if ``self.supervoxels`` is False),
            or supervoxel IDs (if ``self.supervoxels`` is True).

        clip:
            If True, filter the results to exclude any coordinates
            that fall outside this service's bounding-box.
            Otherwise, all brick coordinates that encompass the given labels
            will be returned, whether or not they fall within the bounding box.

    Returns:
        DataFrame with columns [z,y,x,label],
        where z,y,x represents the starting corner (in full-res coordinates)
        of a brick that contains the label.
    """
    assert not isinstance(labels, set), "Pass labels as a list or array, not a set"
    labels = pd.unique(labels)
    is_supervoxels = self.supervoxels
    brick_shape = self.preferred_message_shape
    assert (brick_shape % self.block_width == 0).all(), \
        ("Brick shape ('preferred-message-shape') must be a multiple of the "
         f"block width ({self.block_width}) in all dimensions, not {brick_shape}")

    bad_labels = []

    if not is_supervoxels:
        # No supervoxel filtering.
        # Sort by body, since that should be slightly nicer for dvid performance.
        bodies_and_svs = {label: None for label in sorted(labels)}
    else:
        # Arbitrary heuristic for whether to do the body-lookups on DVID or on the client.
        if len(labels) < 100_000:
            # If we're only dealing with a few supervoxels,
            # ask dvid to map them to bodies for us.
            mapping = fetch_mapping(*self.instance_triple, labels, as_series=True)
        else:
            # If we're dealing with a lot of supervoxels, ask for
            # the entire mapping, and look up the bodies ourselves.
            complete_mapping = fetch_mappings(*self.instance_triple)
            mapper = LabelMapper(complete_mapping.index.values, complete_mapping.values)

            labels = np.asarray(labels, np.uint64)
            bodies = mapper.apply(labels, True)
            mapping = pd.Series(index=labels, data=bodies, name='body')
            mapping.index.rename('sv', inplace=True)

        bad_svs = mapping[mapping == 0]
        bad_labels.extend(bad_svs.index.tolist())

        # Group by body
        mapping = mapping[mapping != 0]
        grouped_svs = mapping.reset_index().groupby('body').agg({'sv': list})['sv']

        # Sort by body, since that should be slightly nicer for dvid performance.
        bodies_and_svs = grouped_svs.sort_index().to_dict()

    # Extract these to avoid pickling 'self' (just for speed)
    server, uuid, instance = self.instance_triple
    if self._use_resource_manager_for_sparse_coords:
        mgr = self.resource_manager_client
    else:
        mgr = ResourceManagerClient("", 0)

    def fetch_brick_coords(body, supervoxel_subset):
        """
        Fetch the block coordinates for the given body,
        filter them for the given supervoxels (if any),
        and convert the block coordinates to brick coordinates.
        """
        assert is_supervoxels or supervoxel_subset is None

        try:
            with mgr.access_context(server, True, 1, 1):
                labelindex = fetch_labelindex(server, uuid, instance, body, 'protobuf')
            coords_df = convert_labelindex_to_pandas(labelindex).blocks
        except HTTPError as ex:
            if (ex.response is not None and ex.response.status_code == 404):
                return (body, None)
            raise
        except RuntimeError as ex:
            if 'does not map to any body' in str(ex):
                return (body, None)
            raise

        if len(coords_df) == 0:
            return (body, None)

        if is_supervoxels:
            supervoxel_subset = set(supervoxel_subset)
            coords_df = coords_df.query('sv in @supervoxel_subset').copy()

        coords_df[['z', 'y', 'x']] //= brick_shape
        coords_df['body'] = np.uint64(body)
        coords_df.drop_duplicates(inplace=True)
        return (body, coords_df)

    def fetch_and_concatenate_brick_coords(bodies_and_supervoxels):
        """
        To reduce the number of tiny DataFrames collected to the driver,
        it's best to concatenate the partitions first, on the workers,
        rather than a straightforward call to starmap(fetch_brick_coords).
        Hence, this function that consolidates each partition.
        """
        bad_bodies = []
        coord_dfs = []
        for (body, supervoxel_subset) in bodies_and_supervoxels:
            _, coords_df = fetch_brick_coords(body, supervoxel_subset)
            if coords_df is None:
                bad_bodies.append(body)
            else:
                coord_dfs.append(coords_df)
                del coords_df

        if coord_dfs:
            return [(pd.concat(coord_dfs, ignore_index=True), bad_bodies)]
        else:
            return [(None, bad_bodies)]

    with Timer(f"Fetching coarse sparsevols for {len(labels)} labels ({len(bodies_and_svs)} bodies)",
               logger=logger):
        import dask.bag as db
        coords_and_bad_bodies = (
            db.from_sequence(bodies_and_svs.items(), npartitions=4096)  # Instead of fancy heuristics, just pick 4096
              .map_partitions(fetch_and_concatenate_brick_coords)
              .compute()
        )

    coords_df_partitions, bad_body_partitions = zip(*coords_and_bad_bodies)

    for body in chain(*bad_body_partitions):
        if is_supervoxels:
            bad_labels.extend(bodies_and_svs[body])
        else:
            bad_labels.append(body)

    if bad_labels:
        name = 'sv' if is_supervoxels else 'body'
        pd.Series(bad_labels, name=name).to_csv('labels-without-sparsevols.csv',
                                                index=False, header=True)
        if len(bad_labels) < 100:
            msg = f"Could not obtain coarse sparsevol for {len(bad_labels)} labels: {bad_labels}"
        else:
            msg = f"Could not obtain coarse sparsevol for {len(bad_labels)} labels. See labels-without-sparsevols.csv"
        logger.warning(msg)

    coords_df_partitions = list(filter(lambda df: df is not None, coords_df_partitions))
    if len(coords_df_partitions) == 0:
        raise RuntimeError("Could not find bricks for any of the given labels")

    coords_df = pd.concat(coords_df_partitions, ignore_index=True)

    if self.supervoxels:
        coords_df['label'] = coords_df['sv']
    else:
        coords_df['label'] = coords_df['body']

    coords_df.drop_duplicates(['z', 'y', 'x', 'label'], inplace=True)
    coords_df[['z', 'y', 'x']] *= brick_shape

    if clip:
        # Keep if the last pixel in the brick is to the right of the bounding-box start
        # and the first pixel in the brick is to the left of the bounding-box stop
        keep = (coords_df[['z', 'y', 'x']] + brick_shape > self.bounding_box_zyx[0]).all(axis=1)
        keep &= (coords_df[['z', 'y', 'x']] < self.bounding_box_zyx[1]).all(axis=1)
        coords_df = coords_df.loc[keep]

    return coords_df[['z', 'y', 'x', 'label']]
def main():
    configure_default_logging()

    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--use-mapping', action='store_true',
                        help='Use in-memory map + /exists instead of /missing, '
                             'as described in the general help text above.')
    parser.add_argument('--output', '-o', default='missing-from-tsv.csv',
                        help='Where to write the output CSV (default: missing-from-tsv.csv)')
    parser.add_argument('--kafka-timestamp', '-k', type=str,
                        help='Alternative to providing your own bodies list.\n'
                             'Use the kafka log to automatically determine the list of bodies that have changed after the given timestamp.\n'
                             'Examples: -k="2018-11-22" -k="2018-11-22 17:34:00"')
    parser.add_argument('server', help='dvid server, e.g. emdata3:8900')
    parser.add_argument('uuid', help='dvid node to analyze or "master" for the latest master branch uuid')
    parser.add_argument('tsv_instance',
                        help="Name of a tarsupervoxels instance, e.g. segmentation_sv_meshes.\n"
                             "Must be sync'd to a labelmap (segmentation) instance.")
    parser.add_argument('bodies_csv', nargs='?',
                        help='CSV containing a column named "body", which will be read.\n'
                             'If no "body" column exists, the first column is used, regardless of the name.\n'
                             '(Omit this arg if you are using --kafka-timestamp)')
    args = parser.parse_args()

    if not (bool(args.kafka_timestamp) ^ bool(args.bodies_csv)):
        print("You must provide either --kafka-timestamp or a bodies list (not both)", file=sys.stderr)
        sys.exit(1)

    if args.uuid == "master":
        args.uuid = find_master(args.server)

    # Determine segmentation instance
    info = fetch_instance_info(args.server, args.uuid, args.tsv_instance)
    seg_instance = info["Base"]["Syncs"][0]

    kafka_msgs = None
    if args.bodies_csv:
        if 'body' in read_csv_header(args.bodies_csv):
            bodies = pd.read_csv(args.bodies_csv)['body'].drop_duplicates()
        else:
            # Just read the first column, no matter what it's named
            bodies = read_csv_col(args.bodies_csv, 0, np.uint64).drop_duplicates()
    elif args.kafka_timestamp:
        # Validate timestamp format before fetching kafka log, which takes a while.
        parse_timestamp(args.kafka_timestamp)

        kafka_msgs = read_kafka_messages(args.server, args.uuid, seg_instance)
        filtered_kafka_msgs = filter_kafka_msgs_by_timerange(kafka_msgs,
                                                             min_timestamp=args.kafka_timestamp)

        new_bodies, changed_bodies, _removed_bodies, new_supervoxels, _deleted_svs = compute_affected_bodies(filtered_kafka_msgs)
        sv_split_bodies = set(fetch_mapping(args.server, args.uuid, seg_instance, new_supervoxels)) - set([0])

        bodies = set(chain(new_bodies, changed_bodies, sv_split_bodies))
        bodies = np.fromiter(bodies, np.uint64)
        bodies.sort()
    else:
        raise AssertionError("Shouldn't get here.")

    if args.use_mapping:
        missing_entries = check_tarsupervoxels_status_via_exists(args.server, args.uuid, args.tsv_instance,
                                                                 bodies, seg_instance, kafka_msgs=kafka_msgs)
    else:
        missing_entries = check_tarsupervoxels_status_via_missing(args.server, args.uuid, args.tsv_instance, bodies)

    logger.info(f"Writing to {args.output}")
    missing_entries.to_csv(args.output, index=True, header=True)
    logging.info("DONE")
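# Hedged usage sketch (added for illustration, not part of the original source):
# the script name, server, uuid, instance, and CSV file names below are hypothetical
# placeholders showing the two mutually exclusive invocation styles.
#
#   python check_tarsupervoxels_status.py emdata3:8900 master segmentation_sv_meshes bodies.csv
#   python check_tarsupervoxels_status.py --use-mapping -k="2018-11-22 17:34:00" emdata3:8900 master segmentation_sv_meshes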
def generate_mergereview_assignments_from_df(server, uuid, instance, mr_fragments_df, bois,
                                             assignment_size, output_dir, single_file=False):
    """
    Generate a set of assignments for the given mergereview fragments.
    The assignments are written to a nested hierarchy:
    Grouped first by task size (number of bodies in each task),
    and then grouped in batches of N tasks (assignment_size).

    The body IDs emitted in the assignments and their classification as
    "BOI" or not is determined by fetching the mappings for each supervoxel
    in the dataframe.
    """
    # Sort table by task size (edge count)
    group_sizes = mr_fragments_df.groupby(['group_cc', 'cc_task']).size().rename('group_size')
    mr_fragments_df = mr_fragments_df.merge(group_sizes, 'left',
                                            left_on=['group_cc', 'cc_task'], right_index=True)
    mr_fragments_df = mr_fragments_df.sort_values(['group_size', 'group_cc', 'cc_task'])

    mr_fragments_df['body_a'] = fetch_mapping(server, uuid, instance, mr_fragments_df['sv_a'])
    mr_fragments_df['body_b'] = fetch_mapping(server, uuid, instance, mr_fragments_df['sv_b'])

    mr_fragments_df['is_boi_a'] = mr_fragments_df.eval('body_a in @bois')
    mr_fragments_df['is_boi_b'] = mr_fragments_df.eval('body_b in @bois')

    # Group assignments by task size and emit an assignment for each group
    all_tasks = {}
    for group_size, same_size_tasks_df in mr_fragments_df.groupby('group_size'):
        group_tasks = []
        for (group_cc, cc_task), task_df in same_size_tasks_df.groupby(['group_cc', 'cc_task']):
            svs = pd.unique(task_df[['sv_a', 'sv_b']].values.reshape(-1))
            svs = np.sort(svs)

            boi_svs = set(task_df[task_df['is_boi_a']]['sv_a'].tolist())
            boi_svs |= set(task_df[task_df['is_boi_b']]['sv_b'].tolist())

            task_bodies = pd.unique(task_df[['body_a', 'body_b']].values.reshape(-1)).tolist()

            task = {
                # neu3 fields
                'task type': "merge review",
                'task id': hex(zlib.crc32(svs)),
                'supervoxel IDs': svs.tolist(),
                'boi supervoxel IDs': sorted(boi_svs),

                # Encode edge table as json
                "supervoxel IDs A": task_df['sv_a'].tolist(),
                "supervoxel IDs B": task_df['sv_b'].tolist(),
                "supervoxel points A": task_df[['xa', 'ya', 'za']].values.tolist(),
                "supervoxel points B": task_df[['xb', 'yb', 'zb']].values.tolist(),

                # Debugging fields
                'group_cc': int(group_cc),
                'cc_task': int(cc_task),
                'original_bodies': sorted(task_bodies),
                'total_body_count': len(task_bodies),
                'original_uuid': uuid,
            }
            group_tasks.append(task)

        num_bodies = group_size + 1
        all_tasks[num_bodies] = group_tasks

    if single_file:
        # In single-file mode, the 'output_dir' is interpreted as the assignment path
        assert output_dir.endswith('.json')
        output_path = output_dir
        assignment = {
            "file type": "Neu3 task list",
            "file version": 1,
            "task list": list(chain(*all_tasks.values()))
        }

        with open(output_path, 'w') as f:
            #json.dump(assignment, f, indent=2)
            pretty_print_assignment_json_items(assignment.items(), f)
    else:
        # Now that the task json data has been generated and split into groups (by body count),
        # write them into multiple directories (one per group), each of which has multiple files
        # (one per task batch, as specified by assignment_size)
        for num_bodies, group_tasks in all_tasks.items():
            output_subdir = f'{output_dir}/{num_bodies:02}-bodies'
            os.makedirs(output_subdir, exist_ok=True)

            for i, batch_start in enumerate(tqdm_proxy(range(0, len(group_tasks), assignment_size), leave=False)):
                output_path = f"{output_dir}/{num_bodies:02}-bodies/assignment-{i:04d}.json"
                batch_tasks = group_tasks[batch_start:batch_start + assignment_size]
                assignment = {
                    "file type": "Neu3 task list",
                    "file version": 1,
                    "task list": batch_tasks
                }

                with open(output_path, 'w') as f:
                    #json.dump(assignment, f, indent=2)
                    pretty_print_assignment_json_items(assignment.items(), f)

    return all_tasks
    if with_labels:
        labels = {n: str(g.nodes[n]['body']) for n in g.nodes}
    else:
        labels = None

    print(f"Drawing {len(g.nodes)} nodes and {len(g.edges)} edges")
    p = draw(g,
             node_color=node_colors,
             edge_color=edge_color,
             labels=labels,
             pos=nx.kamada_kawai_layout(g),
             with_labels=with_labels)

    if hv:
        p = p.opts(plot=dict(width=width, height=width))

    return p


if __name__ == "__main__":
    df = pd.DataFrame(np.load('/tmp/philip_small_not_tiny_df.npy', allow_pickle=True))
    df['body_a'] = fetch_mapping('emdata4:8900', '28e6', 'segmentation', df['sv_a'])
    df['body_b'] = fetch_mapping('emdata4:8900', '28e6', 'segmentation', df['sv_b'])
    bois = set(df[['body_a', 'body_b']].values.reshape(-1))
    generate_mergereview_assignments_from_df('emdata4:8900', '28e6', 'segmentation',
                                             df, bois, 10, '/tmp/philip-assignments')
def fetch_roi_synapses(server, uuid, synapses_instance, rois, fetch_labels=False,
                       return_partners=False, processes=16):
    """
    Fetch the coordinates and (optionally) body labels for
    all synapses that fall within the given ROIs.

    Args:
        server:
            DVID server, e.g. 'emdata4:8900'

        uuid:
            DVID uuid, e.g. 'abc9'

        synapses_instance:
            DVID synapses instance name, e.g. 'synapses'

        rois:
            A single DVID ROI instance name or a list of them, e.g. 'EB' or ['EB', 'FB']

        fetch_labels:
            If True, also fetch the supervoxel and body label underneath each synapse,
            returned in columns 'sv' and 'body'.

        return_partners:
            If True, also return the partners table.

        processes:
            How many parallel processes to use when fetching synapses and supervoxel labels.

    Returns:
        pandas DataFrame with columns:
        ``['z', 'y', 'x', 'kind', 'conf']`` and ``['sv', 'body']`` (if ``fetch_labels=True``)

        If return_partners is True, also return the partners table.

    Example:
        df = fetch_roi_synapses('emdata4:8900', '3c281', 'synapses', ['PB(L5)', 'PB(L7)'], True, 8)
    """
    # Late imports to avoid circular imports in dvid/__init__
    from neuclease.dvid import fetch_combined_roi_volume, determine_point_rois, fetch_labels_batched, fetch_mapping, fetch_mappings

    assert rois, "No rois provided, result would be empty. Is that what you meant?"

    if isinstance(rois, str):
        rois = [rois]

    # Determine name of the segmentation instance that's
    # associated with the given synapses instance.
    syn_info = fetch_instance_info(server, uuid, synapses_instance)
    seg_instance = syn_info["Base"]["Syncs"][0]

    logger.info(f"Fetching mask for ROIs: {rois}")
    # Fetch the ROI as a low-res array (scale 5, i.e. 32-px resolution)
    roi_vol_s5, roi_box_s5, overlapping_pairs = fetch_combined_roi_volume(server, uuid, rois)

    if len(overlapping_pairs) > 0:
        logger.warning("Some ROIs overlapped and are thus not completely represented in the output:\n"
                       f"{overlapping_pairs}")

    # Convert to full-res box
    roi_box = (2**5) * roi_box_s5

    # fetch_synapses_in_batches() requires a box that is 64-px-aligned
    roi_box = round_box(roi_box, 64, 'out')

    logger.info("Fetching synapse points")
    # points_df is a DataFrame with columns for [z,y,x]
    points_df, partners_df = fetch_synapses_in_batches(server, uuid, synapses_instance,
                                                       roi_box, processes=processes)

    # Append a 'roi_name' column to points_df
    logger.info("Labeling ROI for each point")
    determine_point_rois(server, uuid, rois, points_df, roi_vol_s5, roi_box_s5)

    logger.info("Discarding points that don't overlap with the roi")
    rois = {*rois}
    points_df = points_df.query('roi in @rois').copy()

    columns = ['z', 'y', 'x', 'kind', 'conf', 'roi_label', 'roi']

    if fetch_labels:
        logger.info("Fetching supervoxel under each point")
        svs = fetch_labels_batched(server, uuid, seg_instance,
                                   points_df[['z', 'y', 'x']].values,
                                   supervoxels=True,
                                   processes=processes)

        with Timer("Mapping supervoxels to bodies", logger):
            # Arbitrary heuristic for whether to do the
            # body-lookups on DVID or on the client.
            if len(svs) < 100_000:
                bodies = fetch_mapping(server, uuid, seg_instance, svs)
            else:
                mapping = fetch_mappings(server, uuid, seg_instance)
                mapper = LabelMapper(mapping.index.values, mapping.values)
                bodies = mapper.apply(svs, True)

        points_df['sv'] = svs
        points_df['body'] = bodies
        columns += ['body', 'sv']

    if return_partners:
        # Filter
        #partners_df = partners_df.query('post_id in @points_df.index and pre_id in @points_df.index').copy()

        # Faster filter (via merge)
        partners_df = partners_df.merge(points_df[[]], 'inner', left_on='pre_id', right_index=True)
        partners_df = partners_df.merge(points_df[[]], 'inner', left_on='post_id', right_index=True)
        return points_df[columns], partners_df
    else:
        return points_df[columns]