Example #1
def test_post_hierarchical_cleaves(labelmap_setup):
    dvid_server, dvid_repo, _merge_table_path, _mapping_path, _supervoxel_vol = labelmap_setup

    uuid = post_branch(dvid_server, dvid_repo,
                       'segmentation-post_hierarchical_cleaves', '')
    instance_info = dvid_server, uuid, 'segmentation-post_hierarchical_cleaves'
    create_labelmap_instance(*instance_info)

    svs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    groups = [1, 1, 2, 2, 3, 3, 3, 3, 3, 4]

    svs = np.asarray(svs, np.uint64)

    # Post some supervoxels in multiple blocks, just to prove that post_hierarchical_cleaves()
    # doesn't assume that the labelindex has the same length as the mapping.
    sv_vol = np.zeros((128, 64, 64), np.uint64)
    sv_vol[0, 0, :len(svs)] = svs
    sv_vol[64, 0, 0:len(svs):2] = svs[::2]

    post_labelmap_voxels(*instance_info, (0, 0, 0), sv_vol)

    post_merge(*instance_info, 1, svs[1:])

    group_mapping = pd.Series(index=svs, data=groups)
    final_table = post_hierarchical_cleaves(*instance_info, 1, group_mapping)

    assert (fetch_mapping(*instance_info,
                          svs) == final_table['body'].values).all()
    assert (final_table.drop_duplicates(
        ['group']) == final_table.drop_duplicates(['group',
                                                   'body'])).all().all()
    assert (final_table.drop_duplicates(
        ['body']) == final_table.drop_duplicates(['group',
                                                  'body'])).all().all()

    # Since the mapping included all supervoxels in the body,
    # the last group is left with the original label.
    assert final_table.iloc[-1]['body'] == 1

    # Now merge them all together and try again, but leave
    # two supervoxels out of the groups this time.
    merges = set(pd.unique(final_table['body'].values)) - set([1])
    post_merge(*instance_info, 1, list(merges))

    group_mapping = pd.Series(index=svs[:-2], data=groups[:-2])
    final_table = post_hierarchical_cleaves(*instance_info, 1, group_mapping)

    assert len(
        final_table.query('body == 1')
    ) == 0, "Did not expect any of the groups to retain the original body ID!"
    assert (fetch_mapping(*instance_info,
                          svs[:-2]) == final_table['body'].values).all()
    assert (final_table.drop_duplicates(
        ['group']) == final_table.drop_duplicates(['group',
                                                   'body'])).all().all()
    assert (final_table.drop_duplicates(
        ['body']) == final_table.drop_duplicates(['group',
                                                  'body'])).all().all()
    assert (fetch_mapping(*instance_info, svs[-2:]) == 1).all()
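
For reference, the core round-trip the assertions above rely on is simple: post a merge, then call fetch_mapping to see which body each supervoxel currently maps to. A minimal sketch, assuming a reachable DVID server and supervoxels that already exist in the labelmap instance (the server/uuid/instance below are placeholders):

import numpy as np
from neuclease.dvid import fetch_mapping, post_merge

# Placeholder server/uuid/instance -- substitute your own.
instance_info = ('emdata4:8900', 'abc123', 'segmentation')

svs = np.array([1, 2, 3], np.uint64)

# Merge supervoxels 2 and 3 into body 1.
post_merge(*instance_info, 1, svs[1:])

# fetch_mapping returns the body ID for each supervoxel, in input order.
bodies = fetch_mapping(*instance_info, svs)
print(bodies)   # expected: all 1, as in the assertions above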
Example #2
def cleave_supervoxels_as_isolated_bodies(instance_info, sv_ids):
    """
    Separate the given supervoxels from their enclosing bodies by
    cleaving them out as their own independent, single-supervoxel bodies.
    """
    logger.info("Fetching mapping for each SV")
    body_ids = fetch_mapping(*instance_info, sv_ids, as_series=True)

    logger.info("Performing cleaves")
    cleaved_ids = []
    for sv_id, body_id in tqdm(list(zip(sv_ids, body_ids))):
        try:
            cleaved_body = post_cleave(*instance_info, body_id, [sv_id])
        except requests.RequestException as ex:
            if 'cannot cleave all supervoxels from the label' in ex.response.content.decode():
                # Body has only one supervoxel to begin with
                cleaved_body = body_id
            else:
                sys.stderr.write(ex.response.content.decode())
                raise

        cleaved_ids.append(cleaved_body)

    return list(zip(sv_ids, body_ids, cleaved_ids))
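
A possible invocation of the helper above, with placeholder server/uuid and supervoxel IDs:

# Placeholder instance and supervoxel IDs, for illustration only.
instance_info = ('emdata4:8900', 'abc123', 'segmentation')
sv_ids = [1224133018, 1224133019]

results = cleave_supervoxels_as_isolated_bodies(instance_info, sv_ids)

# Each entry is (supervoxel, original body, new single-supervoxel body).
for sv, old_body, new_body in results:
    print(f"sv {sv}: body {old_body} -> {new_body}")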
Example #3
def main():
    # Hard-coded parameters
    prod = 'emdata4:8900'
    master = (prod, find_master(prod))
    master_seg = (*master, 'segmentation')

    # I accidentally corrupted the labelindex of bodies in this region
    patch_box = 20480 + np.array([[0, 0, 0], [1024, 1024, 1024]])

    with Timer("Fetching supervoxels", logger):
        boxes = boxes_from_grid(patch_box, Grid((64, 64, 6400)), clipped=True)
        sv_sets = compute_parallel(partial(_fetch_svs, master_seg),
                                   boxes,
                                   processes=32,
                                   ordered=False,
                                   leave_progress=True)
        svs = set(chain(*sv_sets)) - set([0])

    bodies = set(fetch_mapping(*master_seg, svs))

    with Timer(f"Repairing {len(bodies)} labelindexes", logger):
        compute_parallel(partial(_repair_index, master_seg),
                         bodies,
                         processes=32,
                         ordered=False,
                         leave_progress=True)

    print("DONE.")
Example #4
def extract_body_ids_and_launch(c, args, seg_instance, body_csv, msgs_df):
    """
    Extract the list of body IDs from the given kafka messages,
    overwrite the body list CSV file in the workflow template directory,
    and submit a cluster job to launch the workflow.
    """
    # Late imports so --help works quickly
    import numpy as np
    import pandas as pd
    from neuclease.dvid import resolve_ref, fetch_mapping, compute_affected_bodies

    if len(msgs_df) == 0:
        return False

    exclude_bodies = load_bad_bodies()

    # If the uuid was specified as a branch,
    # resolve it to a specific uuid now.
    server, uuid, instance = seg_instance
    uuid = resolve_ref(server, uuid)

    # Extract all bodies and supervoxels that have been touched in the kafka log
    new_bodies, changed_bodies, _removed_bodies, new_supervoxels = compute_affected_bodies(
        msgs_df['msg'])

    # For touched supervoxels, we need to find their mapped bodies.
    sv_split_bodies = set(
        fetch_mapping(server, uuid, instance, new_supervoxels)) - set([0])

    subset_bodies = set(chain(new_bodies, changed_bodies, sv_split_bodies))
    subset_bodies -= set(exclude_bodies)
    subset_bodies = np.fromiter(subset_bodies, np.uint64)
    subset_bodies = np.sort(subset_bodies).tolist()

    if len(subset_bodies) == 0:
        return False

    # Overwrite the CSV file for the workflow's subset-bodies set.
    pd.Series(subset_bodies,
              name='body').to_csv(f'{args.template_dir}/{body_csv}',
                                  header=True,
                                  index=False)

    first_timestamp = msgs_df['timestamp'].iloc[0]
    last_timestamp = msgs_df['timestamp'].iloc[-1]

    logger.info(f"Launching mesh computation for {len(subset_bodies)} bodies, "
                f"modified between [{first_timestamp}] and [{last_timestamp}]")

    # FIXME: Instead of hard-coding -W to one hour, read the template dask-config.yaml
    cmd = (
        f"source $({args.conda_path} info --base)/bin/activate {args.conda_env} "
        f"&& cd {args.cwd} "
        f"&& bsub -W 01:00 -n {args.driver_slots} -o /dev/null launchflow -n {args.worker_slots} {args.template_dir}"
    )

    run_cmd(c, cmd)
    return True
Example #5
def adjust_focused_points(server,
                          uuid,
                          instance,
                          assignment_json_data,
                          supervoxels=True,
                          max_search_scale=3):
    new_assignment_data = copy.deepcopy(assignment_json_data)
    new_tasks = new_assignment_data["task list"]

    for task in tqdm_proxy(new_tasks):
        sv_1 = task["supervoxel ID 1"]
        sv_2 = task["supervoxel ID 2"]

        coord_1 = np.array(task["supervoxel point 1"])
        coord_2 = np.array(task["supervoxel point 2"])

        if supervoxels:
            label_1 = sv_1
            label_2 = sv_2
        else:
            label_1, label_2 = fetch_mapping(server,
                                             uuid,
                                             instance, [sv_1, sv_2],
                                             as_series=True)

        avg_coord = (coord_1 + coord_2) // 2

        # Search until we find a scale in which the two touch, or give up.
        for scale in range(1 + max_search_scale):
            box_xyz = (avg_coord // (2**scale) - 64,
                       avg_coord // (2**scale) + 64)
            box_zyx = np.array(box_xyz)[:, ::-1]
            seg_vol = fetch_labelarray_voxels(server,
                                              uuid,
                                              instance,
                                              box_zyx,
                                              scale,
                                              supervoxels=supervoxels)

            adjusted_coords_zyx = find_best_plane(seg_vol, label_1, label_2)
            adjusted_coords_zyx = np.array(adjusted_coords_zyx)

            if not (adjusted_coords_zyx == -1).all():
                # Found it.
                adjusted_coords_zyx += box_zyx[0]
                adjusted_coords_zyx *= (2**scale)
                break

        if (adjusted_coords_zyx == -1).all():
            task["coordinate-status"] = "misplaced"
        else:
            task["supervoxel point 1"] = adjusted_coords_zyx[0, ::-1].tolist()
            task["supervoxel point 2"] = adjusted_coords_zyx[1, ::-1].tolist()
            task["coordinate-status"] = f"adjusted-at-scale-{scale}"

    return new_assignment_data
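
One possible way to apply this to an assignment file on disk (the paths and DVID instance below are placeholders):

import json

with open('focused-assignment.json', 'r') as f:
    assignment = json.load(f)

adjusted = adjust_focused_points('emdata4:8900', 'abc123', 'segmentation',
                                 assignment, supervoxels=True)

with open('focused-assignment-adjusted.json', 'w') as f:
    json.dump(adjusted, f, indent=2)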
Example #6
    def determine_changed_labelmap_bodies(self, kafka_timestamp_string):
        """
        Read the entire labelmap kafka log, and determine
        which bodies have changed since the given timestamp (a string).

        Example timestamps:
            - "2018-11-22"
            - "2018-11-22 17:34:00"

        Returns:
            list of body IDs
        """
        logger.info(
            f"Determining which bodies have changed since {kafka_timestamp_string}"
        )

        try:
            kafka_timestamp = parse_timestamp(kafka_timestamp_string)
        except:
            raise RuntimeError(
                f"Could not parse your subset-bodies config setting ({kafka_timestamp_string}) "
                "as either a body list or a kafka timestamp")

        seg_instance = self.instance_triple

        kafka_msgs = read_kafka_messages(*seg_instance)
        filtered_kafka_msgs = filter_kafka_msgs_by_timerange(
            kafka_msgs, min_timestamp=kafka_timestamp)

        new_bodies, changed_bodies, _removed_bodies, new_supervoxels = compute_affected_bodies(
            filtered_kafka_msgs)
        sv_split_bodies = set(fetch_mapping(*seg_instance,
                                            new_supervoxels)) - set([0])

        subset_bodies = set(chain(new_bodies, changed_bodies, sv_split_bodies))
        subset_bodies = np.fromiter(subset_bodies, np.uint64)
        subset_bodies = np.sort(subset_bodies).tolist()

        logger.info(
            f"The kafka log shows that {len(subset_bodies)} bodies have changed since ({kafka_timestamp_string})"
        )
        return subset_bodies
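
The same kafka-driven pattern appears in Examples #4 and #9: compute the affected bodies and supervoxels from the filtered messages, map the new supervoxels to their current bodies with fetch_mapping, and take the union. A standalone sketch of that step (some neuclease versions return a fifth element, deleted supervoxels, from compute_affected_bodies, as in Example #9; this sketch assumes the 4-tuple form used here):

from itertools import chain
import numpy as np
from neuclease.dvid import fetch_mapping, compute_affected_bodies

def affected_bodies(server, uuid, instance, filtered_kafka_msgs):
    new_bodies, changed_bodies, _removed_bodies, new_supervoxels = \
        compute_affected_bodies(filtered_kafka_msgs)

    # Split supervoxels may now belong to different bodies; map them
    # and discard 0 (supervoxels that no longer exist).
    sv_split_bodies = set(fetch_mapping(server, uuid, instance, new_supervoxels)) - {0}

    bodies = set(chain(new_bodies, changed_bodies, sv_split_bodies))
    bodies = np.fromiter(bodies, np.uint64)
    bodies.sort()
    return bodies.tolist()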
Example #7
def test_masksegmentation_basic(setup_dvid_segmentation_input, invert_mask,
                                roi_dilation, disable_auto_retry):
    template_dir, config, volume, dvid_address, repo_uuid, roi_mask_s5, input_segmentation_name, output_segmentation_name = setup_dvid_segmentation_input

    if invert_mask:
        roi_mask_s5 = ~roi_mask_s5

    config["masksegmentation"]["invert-mask"] = invert_mask
    config["masksegmentation"]["dilate-roi"] = roi_dilation

    # re-dump config
    yaml = YAML()
    yaml.default_flow_style = False
    with open(f"{template_dir}/workflow.yaml", 'w') as f:
        yaml.dump(config, f)

    execution_dir, workflow = launch_flow(template_dir, 1)
    final_config = workflow.config

    input_box_xyz = np.array(final_config['input']['geometry']['bounding-box'])
    input_box_zyx = input_box_xyz[:, ::-1]

    roi_mask = upsample(roi_mask_s5, 2**5)
    roi_mask = extract_subvol(roi_mask, input_box_zyx)

    expected_vol = extract_subvol(volume.copy(), input_box_zyx)
    expected_vol[roi_mask] = 0

    output_box_xyz = np.array(
        final_config['output']['geometry']['bounding-box'])
    output_box_zyx = output_box_xyz[:, ::-1]
    output_vol = fetch_labelmap_voxels(dvid_address,
                                       repo_uuid,
                                       output_segmentation_name,
                                       output_box_zyx,
                                       scale=0,
                                       supervoxels=True)

    # Create a copy of the volume that contains only the voxels we removed
    erased_vol = volume.copy()
    erased_vol[~roi_mask] = 0

    if EXPORT_DEBUG_FILES:
        original_vol = fetch_labelmap_voxels(dvid_address,
                                             repo_uuid,
                                             input_segmentation_name,
                                             output_box_zyx,
                                             scale=0,
                                             supervoxels=True)
        original_agglo_vol = fetch_labelmap_voxels(dvid_address,
                                                   repo_uuid,
                                                   input_segmentation_name,
                                                   output_box_zyx,
                                                   scale=0)
        output_agglo_vol = fetch_labelmap_voxels(dvid_address,
                                                 repo_uuid,
                                                 output_segmentation_name,
                                                 output_box_zyx,
                                                 scale=0)
        np.save('/tmp/original-svs.npy', original_vol)
        np.save('/tmp/original-agglo.npy', original_agglo_vol)
        np.save('/tmp/output.npy', output_vol)
        np.save('/tmp/output-agglo.npy', output_agglo_vol)
        np.save('/tmp/expected.npy', expected_vol)
        np.save('/tmp/erased.npy', erased_vol)

        shutil.copyfile(f'{execution_dir}/roi-mask.h5', '/tmp/roi-mask.h5')
        if roi_dilation:
            shutil.copyfile(f'{execution_dir}/dilated-roi-mask.h5',
                            '/tmp/dilated-roi-mask.h5')
        if invert_mask:
            shutil.copyfile(f'{execution_dir}/segmentation-mask.h5',
                            '/tmp/segmentation-mask.h5')
        shutil.copyfile(f'{execution_dir}/final-mask.h5', '/tmp/final-mask.h5')

    if roi_dilation > 0:
        # FIXME: We don't yet verify voxel-accuracy of ROI dilation.
        return

    assert (output_vol == expected_vol).all(), \
        "Written vol does not match expected"

    scaled_expected_vol = expected_vol
    for scale in range(1, 1 + MAX_SCALE):
        scaled_expected_vol = downsample(scaled_expected_vol, 2,
                                         'labels-numba')
        scaled_output_vol = fetch_labelmap_voxels(dvid_address,
                                                  repo_uuid,
                                                  output_segmentation_name,
                                                  output_box_zyx // 2**scale,
                                                  scale=scale,
                                                  supervoxels=True)

        if EXPORT_DEBUG_FILES:
            np.save(f'/tmp/expected-{scale}.npy', scaled_expected_vol)
            np.save(f'/tmp/output-{scale}.npy', scaled_output_vol)

        if scale <= 5:
            assert (scaled_output_vol == scaled_expected_vol).all(), \
                f"Written vol does not match expected at scale {scale}"
        else:
            # For scale 6 and 7, some blocks are not even changed,
            # but that means we would be comparing DVID's label
            # downsampling method to our method ('labels-numba').
            # The two don't necessarily give identical results in the case of 'ties',
            # so we'll just verify that the nonzero voxels match, at least.
            assert ((scaled_output_vol == 0) == (scaled_expected_vol == 0)).all(), \
                f"Written vol does not match expected at scale {scale}"

    block_stats_path = f'{execution_dir}/erased-block-statistics.h5'
    with h5py.File(block_stats_path, 'r') as f:
        stats_df = pd.DataFrame(f['stats'][:])

    #
    # Check the exported block statistics
    #
    stats_cols = [*BLOCK_STATS_DTYPES.keys()]
    assert stats_df.columns.tolist() == stats_cols
    stats_df = stats_df.sort_values(stats_cols).reset_index()

    expected_stats_df = block_stats_for_volume((64, 64, 64), erased_vol,
                                               input_box_zyx)
    expected_stats_df = expected_stats_df.sort_values(stats_cols).reset_index()

    assert len(stats_df) == len(expected_stats_df)
    assert (stats_df == expected_stats_df).all().all()

    #
    # Try updating the labelindexes
    #
    src_info = (dvid_address, repo_uuid, input_segmentation_name)
    dest_info = (dvid_address, repo_uuid, output_segmentation_name)
    with switch_cwd(execution_dir):
        erase_from_labelindexes(src_info,
                                dest_info,
                                block_stats_path,
                                batch_size=10,
                                threads=4)

    # Verify deleted supervoxels
    assert os.path.exists(f'{execution_dir}/deleted-supervoxels.csv')
    deleted_svs = set(
        pd.read_csv(f'{execution_dir}/deleted-supervoxels.csv')['sv'])

    orig_svs = {*pd.unique(volume.reshape(-1))} - {0}
    remaining_svs = {*pd.unique(expected_vol.reshape(-1))} - {0}
    expected_deleted_svs = orig_svs - remaining_svs
    assert deleted_svs == expected_deleted_svs

    # Verify remaining sizes
    expected_sv_counts = (pd.Series(
        expected_vol.reshape(-1),
        name='sv').value_counts().drop(0).sort_index().rename('count'))

    index_dfs = []
    for body in np.unique(fetch_mapping(*dest_info, remaining_svs)):
        index_df = fetch_labelindex(*dest_info, body, format='pandas').blocks
        index_dfs.append(index_df)

    sv_counts = (pd.concat(index_dfs, ignore_index=True)[[
        'sv', 'count'
    ]].groupby('sv')['count'].sum().sort_index())
    assert set(sv_counts.index.values) == set(expected_sv_counts.index.values)
    assert (sv_counts == expected_sv_counts).all(), \
        pd.DataFrame({'stored_count': sv_counts, 'expected_count': expected_sv_counts}).query('stored_count != expected_count')

    # Verify mapping
    # Deleted supervoxels exist in the mapping, but they map to 0.
    assert (fetch_mapping(*dest_info, [*deleted_svs]) == 0).all()

    # Remaining supervoxels still map to their original bodies
    assert (fetch_mapping(*dest_info, [*remaining_svs]) == fetch_mapping(
        *src_info, [*remaining_svs])).all()
Example #8
    def sparse_brick_coords_for_labels(self, labels, clip=True):
        """
        Return a DataFrame indicating the brick
        coordinates (starting corner) that encompass the given labels.

        Args:
            labels:
                A list of body IDs (if ``self.supervoxels`` is False),
                or supervoxel IDs (if ``self.supervoxels`` is True).

            clip:
                If True, filter the results to exclude any coordinates
                that fall outside this service's bounding-box.
                Otherwise, all brick coordinates that encompass the given labels
                will be returned, whether or not they fall within the bounding box.

        Returns:
            DataFrame with columns [z,y,x,label],
            where z,y,x represents the starting corner (in full-res coordinates)
            of a brick that contains the label.
        """
        assert not isinstance(labels,
                              set), "Pass labels as a list or array, not a set"
        labels = pd.unique(labels)
        is_supervoxels = self.supervoxels
        brick_shape = self.preferred_message_shape
        assert (brick_shape % self.block_width == 0).all(), \
            ("Brick shape ('preferred-message-shape') must be a multiple of the "
             f"block width ({self.block_width}) in all dimensions, not {brick_shape}")

        bad_labels = []

        if not is_supervoxels:
            # No supervoxel filtering.
            # Sort by body, since that should be slightly nicer for dvid performance.
            bodies_and_svs = {label: None for label in sorted(labels)}
        else:
            # Arbitrary heuristic for whether to do the body-lookups on DVID or on the client.
            if len(labels) < 100_000:
                # If we're only dealing with a few supervoxels,
                # ask dvid to map them to bodies for us.
                mapping = fetch_mapping(*self.instance_triple,
                                        labels,
                                        as_series=True)
            else:
                # If we're dealing with a lot of supervoxels, ask for
                # the entire mapping, and look up the bodies ourselves.
                complete_mapping = fetch_mappings(*self.instance_triple)
                mapper = LabelMapper(complete_mapping.index.values,
                                     complete_mapping.values)

                labels = np.asarray(labels, np.uint64)
                bodies = mapper.apply(labels, True)
                mapping = pd.Series(index=labels, data=bodies, name='body')
                mapping.index.rename('sv', inplace=True)

            bad_svs = mapping[mapping == 0]
            bad_labels.extend(bad_svs.index.tolist())

            # Group by body
            mapping = mapping[mapping != 0]
            grouped_svs = mapping.reset_index().groupby('body').agg(
                {'sv': list})['sv']

            # Sort by body, since that should be slightly nicer for dvid performance.
            bodies_and_svs = grouped_svs.sort_index().to_dict()

        # Extract these to avoid pickling 'self' (just for speed)
        server, uuid, instance = self.instance_triple
        if self._use_resource_manager_for_sparse_coords:
            mgr = self.resource_manager_client
        else:
            mgr = ResourceManagerClient("", 0)

        def fetch_brick_coords(body, supervoxel_subset):
            """
            Fetch the block coordinates for the given body,
            filter them for the given supervoxels (if any),
            and convert the block coordinates to brick coordinates.
            """
            assert is_supervoxels or supervoxel_subset is None

            try:
                with mgr.access_context(server, True, 1, 1):
                    labelindex = fetch_labelindex(server, uuid, instance, body,
                                                  'protobuf')
                coords_df = convert_labelindex_to_pandas(labelindex).blocks

            except HTTPError as ex:
                if (ex.response is not None
                        and ex.response.status_code == 404):
                    return (body, None)
                raise
            except RuntimeError as ex:
                if 'does not map to any body' in str(ex):
                    return (body, None)
                raise

            if len(coords_df) == 0:
                return (body, None)

            if is_supervoxels:
                supervoxel_subset = set(supervoxel_subset)
                coords_df = coords_df.query('sv in @supervoxel_subset').copy()

            coords_df[['z', 'y', 'x']] //= brick_shape
            coords_df['body'] = np.uint64(body)
            coords_df.drop_duplicates(inplace=True)
            return (body, coords_df)

        def fetch_and_concatenate_brick_coords(bodies_and_supervoxels):
            """
            To reduce the number of tiny DataFrames collected to the driver,
            it's best to concatenate the partitions first, on the workers,
            rather than a straightforward call to starmap(fetch_brick_coords).

            Hence, this function, which consolidates each partition.
            """
            bad_bodies = []
            coord_dfs = []
            for (body, supervoxel_subset) in bodies_and_supervoxels:
                _, coords_df = fetch_brick_coords(body, supervoxel_subset)
                if coords_df is None:
                    bad_bodies.append(body)
                else:
                    coord_dfs.append(coords_df)
                    del coords_df

            if coord_dfs:
                return [(pd.concat(coord_dfs, ignore_index=True), bad_bodies)]
            else:
                return [(None, bad_bodies)]

        with Timer(
                f"Fetching coarse sparsevols for {len(labels)} labels ({len(bodies_and_svs)} bodies)",
                logger=logger):
            import dask.bag as db
            coords_and_bad_bodies = (
                db.from_sequence(
                    bodies_and_svs.items(), npartitions=4096
                )  # Instead of fancy heuristics, just pick 4096
                .map_partitions(fetch_and_concatenate_brick_coords).compute())

        coords_df_partitions, bad_body_partitions = zip(*coords_and_bad_bodies)

        for body in chain(*bad_body_partitions):
            if is_supervoxels:
                bad_labels.extend(bodies_and_svs[body])
            else:
                bad_labels.append(body)

        if bad_labels:
            name = 'sv' if is_supervoxels else 'body'
            pd.Series(bad_labels,
                      name=name).to_csv('labels-without-sparsevols.csv',
                                        index=False,
                                        header=True)
            if len(bad_labels) < 100:
                msg = f"Could not obtain coarse sparsevol for {len(bad_labels)} labels: {bad_labels}"
            else:
                msg = f"Could not obtain coarse sparsevol for {len(bad_labels)} labels. See labels-without-sparsevols.csv"

            logger.warning(msg)

        coords_df_partitions = list(
            filter(lambda df: df is not None, coords_df_partitions))
        if len(coords_df_partitions) == 0:
            raise RuntimeError(
                "Could not find bricks for any of the given labels")

        coords_df = pd.concat(coords_df_partitions, ignore_index=True)

        if self.supervoxels:
            coords_df['label'] = coords_df['sv']
        else:
            coords_df['label'] = coords_df['body']

        coords_df.drop_duplicates(['z', 'y', 'x', 'label'], inplace=True)
        coords_df[['z', 'y', 'x']] *= brick_shape

        if clip:
            # Keep if the last pixel in the brick is to the right of the bounding-box start
            # and the first pixel in the brick is to the left of the bounding-box stop
            keep = (coords_df[['z', 'y', 'x']] + brick_shape >
                    self.bounding_box_zyx[0]).all(axis=1)
            keep &= (coords_df[['z', 'y', 'x']] <
                     self.bounding_box_zyx[1]).all(axis=1)
            coords_df = coords_df.loc[keep]

        return coords_df[['z', 'y', 'x', 'label']]
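
The small-vs-large heuristic above also appears in Example #12 below: for a modest number of supervoxels, ask DVID to map them directly via fetch_mapping; for very large sets, download the entire mapping once with fetch_mappings and apply it client-side. A standalone sketch of that pattern, assuming LabelMapper is the mapper class from the dvidutils package (these snippets use it but don't show its import); the 100,000 cutoff is just the arbitrary heuristic used in these examples:

import numpy as np
import pandas as pd
from dvidutils import LabelMapper
from neuclease.dvid import fetch_mapping, fetch_mappings

def map_svs_to_bodies(server, uuid, instance, svs):
    """
    Map supervoxel IDs to body IDs, choosing between a direct
    /mapping request and a client-side lookup of the full mapping.
    """
    svs = np.asarray(svs, np.uint64)
    if len(svs) < 100_000:
        # Few supervoxels: let DVID do the lookup.
        return fetch_mapping(server, uuid, instance, svs, as_series=True)

    # Many supervoxels: fetch the complete mapping once and apply it locally.
    complete_mapping = fetch_mappings(server, uuid, instance)
    mapper = LabelMapper(complete_mapping.index.values, complete_mapping.values)
    bodies = mapper.apply(svs, True)
    return pd.Series(index=svs, data=bodies, name='body').rename_axis('sv')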
Example #9
def main():
    configure_default_logging()

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        '--use-mapping',
        action='store_true',
        help=
        'Use in-memory map + /exists instead of /missing, as described in the general help text above.'
    )
    parser.add_argument(
        '--output',
        '-o',
        default='missing-from-tsv.csv',
        help='Where to write the output CSV (default: missing-from-tsv.csv)')

    parser.add_argument(
        '--kafka-timestamp',
        '-k',
        type=str,
        help='Alternative to providing your own bodies list.\n'
        'Use the kafka log to automatically determine the list of bodies that have changed after the given timestamp.\n'
        'Examples: -k="2018-11-22" -k="2018-11-22 17:34:00"')

    parser.add_argument('server', help='dvid server, e.g. emdata3:8900')
    parser.add_argument(
        'uuid',
        help=
        'dvid node to analyze or "master" for the latest master branch uuid')
    parser.add_argument(
        'tsv_instance',
        help="Name of a tarsupervoxels instance, e.g. segmentation_sv_meshes.\n"
        "Must be sync'd to a labelmap (segmentation) instance.")
    parser.add_argument(
        'bodies_csv',
        nargs='?',
        help='CSV containing a column named "body", which will be read.\n'
        'If no "body" column exists, the first column is used, regardless of the name.\n'
        '(Omit this arg if you are using --kafka-timestamp)')
    args = parser.parse_args()

    if not (bool(args.kafka_timestamp) ^ bool(args.bodies_csv)):
        print(
            "You must provide either --kafka-timestamp or a bodies list (not both)",
            file=sys.stderr)
        sys.exit(1)

    if args.uuid == "master":
        args.uuid = find_master(args.server)

    # Determine segmentation instance
    info = fetch_instance_info(args.server, args.uuid, args.tsv_instance)
    seg_instance = info["Base"]["Syncs"][0]

    kafka_msgs = None
    if args.bodies_csv:
        if 'body' in read_csv_header(args.bodies_csv):
            bodies = pd.read_csv(args.bodies_csv)['body'].drop_duplicates()
        else:
            # Just read the first column, no matter what it's named
            bodies = read_csv_col(args.bodies_csv, 0,
                                  np.uint64).drop_duplicates()
    elif args.kafka_timestamp:
        # Validate timestamp format before fetching kafka log, which takes a while.
        parse_timestamp(args.kafka_timestamp)

        kafka_msgs = read_kafka_messages(args.server, args.uuid, seg_instance)
        filtered_kafka_msgs = filter_kafka_msgs_by_timerange(
            kafka_msgs, min_timestamp=args.kafka_timestamp)

        new_bodies, changed_bodies, _removed_bodies, new_supervoxels, _deleted_svs = compute_affected_bodies(
            filtered_kafka_msgs)
        sv_split_bodies = set(
            fetch_mapping(args.server, args.uuid, seg_instance,
                          new_supervoxels)) - set([0])

        bodies = set(chain(new_bodies, changed_bodies, sv_split_bodies))
        bodies = np.fromiter(bodies, np.uint64)
        bodies.sort()
    else:
        raise AssertionError("Shouldn't get here.")

    if args.use_mapping:
        missing_entries = check_tarsupervoxels_status_via_exists(
            args.server,
            args.uuid,
            args.tsv_instance,
            bodies,
            seg_instance,
            kafka_msgs=kafka_msgs)
    else:
        missing_entries = check_tarsupervoxels_status_via_missing(
            args.server, args.uuid, args.tsv_instance, bodies)

    logger.info(f"Writing to {args.output}")
    missing_entries.to_csv(args.output, index=True, header=True)
    logging.info("DONE")
Example #10
def generate_mergereview_assignments_from_df(server,
                                             uuid,
                                             instance,
                                             mr_fragments_df,
                                             bois,
                                             assignment_size,
                                             output_dir,
                                             single_file=False):
    """
    Generate a set of assignments for the given mergereview fragments.
    The assignments are written to a nested hierarchy:
    Grouped first by task size (number of bodies in each task),
    and then grouped in batches of N tasks (assignment_size).
    
    The body IDs emitted in the assignments and their classification as "BOI"
    or not are determined by fetching the mappings for each supervoxel in the dataframe.
    """
    # Sort table by task size (edge count)
    group_sizes = mr_fragments_df.groupby(['group_cc', 'cc_task'
                                           ]).size().rename('group_size')
    mr_fragments_df = mr_fragments_df.merge(group_sizes,
                                            'left',
                                            left_on=['group_cc', 'cc_task'],
                                            right_index=True)
    mr_fragments_df = mr_fragments_df.sort_values(
        ['group_size', 'group_cc', 'cc_task'])

    mr_fragments_df['body_a'] = fetch_mapping(server, uuid, instance,
                                              mr_fragments_df['sv_a'])
    mr_fragments_df['body_b'] = fetch_mapping(server, uuid, instance,
                                              mr_fragments_df['sv_b'])

    mr_fragments_df['is_boi_a'] = mr_fragments_df.eval('body_a in @bois')
    mr_fragments_df['is_boi_b'] = mr_fragments_df.eval('body_b in @bois')

    # Group assignments by task size and emit an assignment for each group
    all_tasks = {}
    for group_size, same_size_tasks_df in mr_fragments_df.groupby(
            'group_size'):
        group_tasks = []
        for (group_cc, cc_task), task_df in same_size_tasks_df.groupby(
            ['group_cc', 'cc_task']):
            svs = pd.unique(task_df[['sv_a', 'sv_b']].values.reshape(-1))
            svs = np.sort(svs)

            boi_svs = set(task_df[task_df['is_boi_a']]['sv_a'].tolist())
            boi_svs |= set(task_df[task_df['is_boi_b']]['sv_b'].tolist())

            task_bodies = pd.unique(task_df[['body_a', 'body_b'
                                             ]].values.reshape(-1)).tolist()

            task = {
                # neu3 fields
                'task type': "merge review",
                'task id': hex(zlib.crc32(svs)),
                'supervoxel IDs': svs.tolist(),
                'boi supervoxel IDs': sorted(boi_svs),

                # Encode edge table as json
                "supervoxel IDs A": task_df['sv_a'].tolist(),
                "supervoxel IDs B": task_df['sv_b'].tolist(),
                "supervoxel points A": task_df[['xa', 'ya',
                                                'za']].values.tolist(),
                "supervoxel points B": task_df[['xb', 'yb',
                                                'zb']].values.tolist(),

                # Debugging fields
                'group_cc': int(group_cc),
                'cc_task': int(cc_task),
                'original_bodies': sorted(task_bodies),
                'total_body_count': len(task_bodies),
                'original_uuid': uuid,
            }
            group_tasks.append(task)

        num_bodies = group_size + 1
        all_tasks[num_bodies] = group_tasks

    if single_file:
        # In single-file mode, the 'output_dir' is interpreted as the assignment path
        assert output_dir.endswith('.json')
        output_path = output_dir
        assignment = {
            "file type": "Neu3 task list",
            "file version": 1,
            "task list": list(chain(*all_tasks.values()))
        }
        with open(output_path, 'w') as f:
            #json.dump(assignment, f, indent=2)
            pretty_print_assignment_json_items(assignment.items(), f)
    else:
        # Now that the task json data has been generated and split into groups (by body count),
        # write them into multiple directories (one per group), each of which has multiple files
        # (one per task batch, as specified by assignment_size)
        for num_bodies, group_tasks in all_tasks.items():
            output_subdir = f'{output_dir}/{num_bodies:02}-bodies'
            os.makedirs(output_subdir, exist_ok=True)
            for i, batch_start in enumerate(
                    tqdm_proxy(range(0, len(group_tasks), assignment_size),
                               leave=False)):
                output_path = f"{output_dir}/{num_bodies:02}-bodies/assignment-{i:04d}.json"

                batch_tasks = group_tasks[batch_start:batch_start +
                                          assignment_size]
                assignment = {
                    "file type": "Neu3 task list",
                    "file version": 1,
                    "task list": batch_tasks
                }

                with open(output_path, 'w') as f:
                    #json.dump(assignment, f, indent=2)
                    pretty_print_assignment_json_items(assignment.items(), f)

    return all_tasks
Example #11
    if with_labels:
        labels = {n: str(g.nodes[n]['body']) for n in g.nodes}
    else:
        labels = None

    print(f"Drawing {len(g.nodes)} nodes and {len(g.edges)} edges")

    p = draw(g,
             node_color=node_colors,
             edge_color=edge_color,
             labels=labels,
             pos=nx.kamada_kawai_layout(g),
             with_labels=with_labels)

    if hv:
        p = p.opts(plot=dict(width=width, height=width))
    return p


if __name__ == "__main__":
    df = pd.DataFrame(
        np.load('/tmp/philip_small_not_tiny_df.npy', allow_pickle=True))
    df['body_a'] = fetch_mapping('emdata4:8900', '28e6', 'segmentation',
                                 df['sv_a'])
    df['body_b'] = fetch_mapping('emdata4:8900', '28e6', 'segmentation',
                                 df['sv_b'])
    bois = set(df[['body_a', 'body_b']].values.reshape(-1))
    generate_mergereview_assignments_from_df('emdata4:8900', '28e6',
                                             'segmentation', df, bois, 10,
                                             '/tmp/philip-assignments')
Example #12
def fetch_roi_synapses(server,
                       uuid,
                       synapses_instance,
                       rois,
                       fetch_labels=False,
                       return_partners=False,
                       processes=16):
    """
    Fetch the coordinates and (optionally) body labels for 
    all synapses that fall within the given ROIs.
    
    Args:
    
        server:
            DVID server, e.g. 'emdata4:8900'
        
        uuid:
            DVID uuid, e.g. 'abc9'
        
        synapses_instance:
            DVID synapses instance name, e.g. 'synapses'
        
        rois:
            A single DVID ROI instance name or a list of them, e.g. 'EB' or ['EB', 'FB']
        
        fetch_labels:
            If True, also fetch the supervoxel and body label underneath each synapse,
            returned in columns 'sv' and 'body'.
            
        return_partners:
            If True, also return the partners table.

        processes:
            How many parallel processes to use when fetching synapses and supervoxel labels.
    
    Returns:
        pandas DataFrame with columns
        ``['z', 'y', 'x', 'kind', 'conf', 'roi_label', 'roi']``,
        plus ``['body', 'sv']`` if ``fetch_labels=True``.
        If return_partners is True, also return the partners table.

    Example:
        df = fetch_roi_synapses('emdata4:8900', '3c281', 'synapses', ['PB(L5)', 'PB(L7)'], fetch_labels=True, processes=8)
    """
    # Late imports to avoid circular imports in dvid/__init__
    from neuclease.dvid import fetch_combined_roi_volume, determine_point_rois, fetch_labels_batched, fetch_mapping, fetch_mappings

    assert rois, "No rois provided, result would be empty. Is that what you meant?"

    if isinstance(rois, str):
        rois = [rois]

    # Determine name of the segmentation instance that's
    # associated with the given synapses instance.
    syn_info = fetch_instance_info(server, uuid, synapses_instance)
    seg_instance = syn_info["Base"]["Syncs"][0]

    logger.info(f"Fetching mask for ROIs: {rois}")
    # Fetch the ROI as a low-res array (scale 5, i.e. 32-px resolution)
    roi_vol_s5, roi_box_s5, overlapping_pairs = fetch_combined_roi_volume(
        server, uuid, rois)

    if len(overlapping_pairs) > 0:
        logger.warning(
            "Some ROIs overlapped and are thus not completely represented in the output:\n"
            f"{overlapping_pairs}")

    # Convert to full-res box
    roi_box = (2**5) * roi_box_s5

    # fetch_synapses_in_batches() requires a box that is 64-px-aligned
    roi_box = round_box(roi_box, 64, 'out')

    logger.info("Fetching synapse points")
    # points_df is a DataFrame with columns for [z,y,x]
    points_df, partners_df = fetch_synapses_in_batches(server,
                                                       uuid,
                                                       synapses_instance,
                                                       roi_box,
                                                       processes=processes)

    # Append a 'roi_name' column to points_df
    logger.info("Labeling ROI for each point")
    determine_point_rois(server, uuid, rois, points_df, roi_vol_s5, roi_box_s5)

    logger.info("Discarding points that don't overlap with the roi")
    rois = {*rois}
    points_df = points_df.query('roi in @rois').copy()

    columns = ['z', 'y', 'x', 'kind', 'conf', 'roi_label', 'roi']

    if fetch_labels:
        logger.info("Fetching supervoxel under each point")
        svs = fetch_labels_batched(server,
                                   uuid,
                                   seg_instance,
                                   points_df[['z', 'y', 'x']].values,
                                   supervoxels=True,
                                   processes=processes)

        with Timer("Mapping supervoxels to bodies", logger):
            # Arbitrary heuristic for whether to do the
            # body-lookups on DVID or on the client.
            if len(svs) < 100_000:
                bodies = fetch_mapping(server, uuid, seg_instance, svs)
            else:
                mapping = fetch_mappings(server, uuid, seg_instance)
                mapper = LabelMapper(mapping.index.values, mapping.values)
                bodies = mapper.apply(svs, True)

        points_df['sv'] = svs
        points_df['body'] = bodies
        columns += ['body', 'sv']

    if return_partners:
        # Filter
        #partners_df = partners_df.query('post_id in @points_df.index and pre_id in @points_df.index').copy()

        # Faster filter (via merge)
        partners_df = partners_df.merge(points_df[[]],
                                        'inner',
                                        left_on='pre_id',
                                        right_index=True)
        partners_df = partners_df.merge(points_df[[]],
                                        'inner',
                                        left_on='post_id',
                                        right_index=True)
        return points_df[columns], partners_df
    else:
        return points_df[columns]
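
A hedged usage sketch following the docstring example above (the server/uuid are placeholders). With fetch_labels=True, each returned synapse row carries its 'sv' and 'body', so per-body synapse counts are a one-line groupby:

points_df = fetch_roi_synapses('emdata4:8900', '3c281', 'synapses',
                               ['PB(L5)', 'PB(L7)'],
                               fetch_labels=True, processes=8)

# Number of synapses on each body, largest first.
synapse_counts = points_df.groupby('body').size().sort_values(ascending=False)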