    def test_2_exclusive_access(self):
        """
        Verify that the server does not grant simultaneous access
        to two clients if it is configured to allow only one at a time.
        """
        resource = 'my-resource'
        DELAY = 0.5

        client_1 = ResourceManagerClient('127.0.0.1', SERVER_PORT, _debug=True)
        client_2 = ResourceManagerClient('127.0.0.1', SERVER_PORT, _debug=True)

        task_started = threading.Event()

        def long_task():
            with client_1.access_context(resource, False, 1, 1000):
                task_started.set()
                time.sleep(DELAY)

        start = time.time()
        th = threading.Thread(target=long_task)
        th.start()

        task_started.wait()
        with client_2.access_context(resource, False, 1, 1000):
            assert time.time() - start >= DELAY, \
                "We shouldn't have been granted access to the resource so quickly!"

        th.join()

    def test_7_timeout(self):
        client = ResourceManagerClient('127.0.0.1', SERVER_PORT, _debug=True)
        try:
            client.read_config()
        except TimeoutError:
            return
        raise AssertionError("Expected a timeout error")

    def test_1_basic(self):
        """
        Request access to a resource.
        """
        resource = 'my-resource'
        client = ResourceManagerClient('127.0.0.1', SERVER_PORT, _debug=True)
        assert client.read_config()["write_reqs"] == 2
        with client.access_context(resource, False, 1, 1000):
            pass

    def test_5_pickle(self):
        """
        Copy the client via pickling and then use the copy after unpickling.
        """
        resource = 'my-resource'
        client = ResourceManagerClient('127.0.0.1', SERVER_PORT, _debug=True)
        with client.access_context(resource, False, 1, 1000):
            pass

        pickled_client = pickle.dumps(client)
        unpickled_client = pickle.loads(pickled_client)

        with unpickled_client.access_context(resource, False, 1, 1000):
            pass
Example #5
    def _init_services(self):
        """
        Initialize the input and output services,
        and fill in 'auto' config values as needed.
        """
        input_config = self.config["input"]
        output_config = self.config["output"]
        mgr_options = self.config["resource-manager"]

        self.mgr_client = ResourceManagerClient(mgr_options["server"],
                                                mgr_options["port"])
        self.input_service = VolumeService.create_from_config(
            input_config, self.mgr_client)

        # For services that support the 'max-scale' creation setting (n5, dvid, zarr),
        # auto-set the max-scale based on the max-pyramid-scale config setting.
        VolumeService.remove_default_service_configs(output_config)
        multiscale_service = {*output_config.keys()} & {'dvid', 'n5', 'zarr'}
        if multiscale_service:
            svc = [*multiscale_service][0]
            if output_config[svc]["create-if-necessary"]:
                max_pyramid_scale = self.config["copygrayscale"][
                    "max-pyramid-scale"]
                max_creation_scale = output_config[svc]["creation-settings"][
                    "max-scale"]
                if max_creation_scale == -1:
                    output_config[svc]["creation-settings"][
                        "max-scale"] = max_pyramid_scale
                elif max_creation_scale < max_pyramid_scale:
                    msg = (
                        f"Your volume creation-settings specify a lower max-scale ({max_creation_scale}) "
                        f"than your CopyGrayscale config max-pyramid-scale ({max_pyramid_scale}).\n"
                        "Change your creation-settings max-scale or remove it from the config so a default can be chosen."
                    )
                    raise RuntimeError(msg)

        replace_default_entries(output_config["geometry"]["bounding-box"],
                                self.input_service.bounding_box_zyx[:, ::-1])
        self.output_service = VolumeService.create_from_config(
            output_config, self.mgr_client)
        assert isinstance( self.output_service, VolumeServiceWriter ), \
            "The output format you are attempting to use does not support writing"

        logger.info(
            f"Output bounding box: {self.output_service.bounding_box_zyx[:,::-1].tolist()}"
        )

        # We use node-local dvid servers when uploading to a gbucket backend,
        # and the gbucket backend needs to be explicitly reloaded
        # (TODO: Is this still true, or has it been fixed by now?)
        if isinstance(
                self.output_service, DvidVolumeService
        ) and self.output_service.server.startswith("http://127.0.0.1"):
            server = self.output_service.server

            @auto_retry(3, 5.0, __name__)
            def reload_meta():
                reload_metadata(server)

            self.run_on_each_worker(reload_meta, once_per_machine=True)
Example #6
    def _execute_labelindices(self, mapping_df):
        config = self.config_data
        options = config["options"]
        resource_manager_client = ResourceManagerClient(
            options["resource-server"], options["resource-port"])

        last_mutid = options["mutation-id"]
        server = config["dvid"]["server"]
        uuid = config["dvid"]["uuid"]
        instance_name = config["dvid"]["segmentation-name"]
        endpoint = f'{server}/api/node/{uuid}/{instance_name}/indices'

        processor = StatsBatchProcessor(last_mutid, endpoint)

        # Load the h5 file
        block_sv_stats = load_stats_h5_to_records(config["block-stats-file"])

        # Note: Initializing this generator involves sorting the (very large) stats array
        batch_rows = options["batch-row-count"]
        batch_generator = generate_stats_batches(block_sv_stats, mapping_df,
                                                 batch_rows)

        batches = self.sc.parallelize(batch_generator,
                                      cpus_per_worker() * num_worker_nodes())
        rt.persist_and_execute(batches, "Distributing batches", logger)

        def process_batch(item):
            stats_batch, total_rows = item
            approximate_bytes = 30 * total_rows  # this is highly unscientific
            with resource_manager_client.access_context(
                    server, False, 1, approximate_bytes):
                processor.process_batch((stats_batch, total_rows))

        with Timer("Processing/sending batches", logger):
            batches.foreach(process_batch)
Example #7
    def init_services(self):
        """
        Initialize the body and mito segmentation services,
        and fill in 'auto' config values as needed.
        """
        mgr_config = self.config["resource-manager"]
        resource_mgr_client = ResourceManagerClient(mgr_config["server"],
                                                    mgr_config["port"])

        body_seg_config = self.config["body-seg"]
        mito_seg_config = self.config["mito-seg"]

        body_svc = VolumeService.create_from_config(body_seg_config,
                                                    resource_mgr_client)
        mito_svc = VolumeService.create_from_config(mito_seg_config,
                                                    resource_mgr_client)

        if isinstance(body_svc.base_service, DvidVolumeService):
            assert not body_svc.base_service.supervoxels, \
                "body segmentation source shouldn't be a supervoxel source."

        if isinstance(mito_svc.base_service, DvidVolumeService):
            assert mito_svc.base_service.supervoxels, \
                "mito segmentation source MUST be a supervoxel source. 'Grouped' mitos are not appropriate for this computation."

        return body_svc, mito_svc
    def init_services(self):
        """
        Initialize the left and right input services,
        and fill in 'auto' config values as needed.
        """
        left_config = self.config["left-input"]
        right_config = self.config["right-input"]
        mgr_config = self.config["resource-manager"]

        self.resource_mgr_client = ResourceManagerClient(
            mgr_config["server"], mgr_config["port"])
        self.left_service = VolumeService.create_from_config(
            left_config, self.resource_mgr_client)
        self.right_service = VolumeService.create_from_config(
            right_config, self.resource_mgr_client)

        if (self.left_service.bounding_box_zyx !=
                self.right_service.bounding_box_zyx).any():
            raise RuntimeError(
                "Your left and right input volumes do not have the same bounding box.  Please specify explicit bounding boxes."
            )

        logger.info(
            f"Bounding box: {self.left_service.bounding_box_zyx[:,::-1].tolist()}"
        )

        if (self.left_service.preferred_message_shape !=
                self.right_service.preferred_message_shape).any():
            raise RuntimeError(
                "Your left and right input volumes must use the same message-block-shape."
            )
Example #9
    def _init_service(self):
        """
        Initialize the input service,
        and fill in 'auto' config values as needed.
        
        Also check the service configurations for errors.
        """
        input_config = self.config["input"]
        mgr_options = self.config["resource-manager"]

        self.mgr_client = ResourceManagerClient(mgr_options["server"],
                                                mgr_options["port"])
        self.input_service = VolumeService.create_from_config(
            input_config, self.mgr_client)

        assert isinstance(self.input_service.base_service, DvidVolumeService), \
            "Only DVID sources are permitted by this workflow."

        assert not (self.input_service.bounding_box_zyx % self.input_service.block_width).any(), \
            "Input bounding box must be a multiple of the block width"

        if isinstance(self.input_service, ScaledVolumeService):
            scale = self.input_service.scale_delta
            assert scale <= 5, "Can't use rescale-level > 5 in this workflow."
            return scale
        return 0
Example #10
    def init_services(self):
        """
        Initialize the mito segmentation and mito-mask services,
        and fill in 'auto' config values as needed.
        """
        mgr_config = self.config["resource-manager"]
        seg_config = self.config["mito-seg"]
        mask_config = self.config["mito-masks"]

        resource_mgr_client = ResourceManagerClient(mgr_config["server"],
                                                    mgr_config["port"])
        seg_service = VolumeService.create_from_config(seg_config,
                                                       resource_mgr_client)
        logger.info(
            f"Bounding box: {seg_service.bounding_box_zyx[:,::-1].tolist()}")

        replace_default_entries(mask_config["geometry"]["bounding-box"],
                                seg_service.bounding_box_zyx[:, ::-1])
        mask_service = VolumeService.create_from_config(
            mask_config, resource_mgr_client)

        if (seg_service.preferred_message_shape !=
                mask_service.preferred_message_shape).any():
            raise RuntimeError(
                "Your input volume and mask volume must use the same message-block-shape."
            )

        return seg_service, mask_service
Example #11
    def init_services(self):
        """
        Initialize the primary, contingency, and output services,
        and fill in 'auto' config values as needed.
        """
        primary_config = self.config["primary-input"]
        contingency_config = self.config["contingency-input"]
        output_config = self.config["output"]

        mgr_config = self.config["resource-manager"]

        self.resource_mgr_client = ResourceManagerClient(
            mgr_config["server"], mgr_config["port"])
        self.primary_service = VolumeService.create_from_config(
            primary_config, self.resource_mgr_client)
        self.contingency_service = VolumeService.create_from_config(
            contingency_config, self.resource_mgr_client)

        # Replace 'auto' dimensions with input bounding box
        replace_default_entries(output_config["geometry"]["bounding-box"],
                                self.primary_service.bounding_box_zyx[:, ::-1])
        replace_default_entries(
            output_config["geometry"]["message-block-shape"],
            self.primary_service.preferred_message_shape[::-1])

        self.output_service = VolumeService.create_from_config(
            output_config, self.resource_mgr_client)

        assert isinstance(self.output_service, VolumeServiceWriter)

        if (self.primary_service.bounding_box_zyx !=
                self.contingency_service.bounding_box_zyx).any():
            raise RuntimeError(
                "Your primary and contingency input volumes do not have the same bounding box.  Please specify explicit bounding boxes."
            )

        if (self.output_service.bounding_box_zyx !=
                self.primary_service.bounding_box_zyx).any():
            raise RuntimeError(
                "Your output volume bounding box doesn't match the input volumes.  Please specify explicit bounding boxes."
            )

        logger.info(
            f"Bounding box: {self.primary_service.bounding_box_zyx[:,::-1].tolist()}"
        )

        if (self.primary_service.preferred_message_shape !=
                self.contingency_service.preferred_message_shape).any():
            raise RuntimeError(
                "Your primary and contingency input volumes must use the same message-block-shape."
            )

        if (self.output_service.preferred_message_shape !=
                self.primary_service.preferred_message_shape).any():
            raise RuntimeError(
                "Your input and output volumes must use the same message-block-shape."
            )
Example #12
    def __init__(self, volume_config, resource_manager_client=None):
        validate(volume_config, BrainMapsVolumeSchema, inject_defaults=True)

        if resource_manager_client is None:
            # Dummy client
            resource_manager_client = ResourceManagerClient("", 0)

        self._brainmaps_client = BrainMapsVolume(
            str(volume_config["brainmaps"]["project"]),
            volume_config["brainmaps"]["dataset"],
            volume_config["brainmaps"]["volume-id"],
            volume_config["brainmaps"]["change-stack-id"],
            dtype=None,
            use_gzip=volume_config["brainmaps"]["use-gzip"])

        # Force client to fetch dtype now, so it isn't fetched after pickling.
        self._brainmaps_client.dtype

        block_width = volume_config["geometry"]["block-width"]
        if block_width == -1:
            # FIXME: I don't actually know what BrainMaps' internal block size is...
            block_width = 64

        preferred_message_shape_zyx = np.array(
            volume_config["geometry"]["message-block-shape"][::-1])
        replace_default_entries(preferred_message_shape_zyx, [64, 64, 6400])

        bounding_box_zyx = np.array(
            volume_config["geometry"]["bounding-box"])[:, ::-1]
        replace_default_entries(bounding_box_zyx,
                                self._brainmaps_client.bounding_box)

        assert  (bounding_box_zyx[0] >= self._brainmaps_client.bounding_box[0]).all() \
            and (bounding_box_zyx[1] <= self._brainmaps_client.bounding_box[1]).all(), \
            f"Specified bounding box ({bounding_box_zyx.tolist()}) extends outside the "\
            f"BrainMaps volume geometry ({self._brainmaps_client.bounding_box.tolist()})"

        available_scales = list(volume_config["geometry"]["available-scales"])
        fetch_blockwise = volume_config["brainmaps"]["fetch-blockwise"]

        # Store members
        self._bounding_box_zyx = bounding_box_zyx
        self._resource_manager_client = resource_manager_client
        self._preferred_message_shape_zyx = preferred_message_shape_zyx
        self._block_width = block_width
        self._available_scales = available_scales
        self._fetch_blockwise = fetch_blockwise

        # Overwrite config entries that we might have modified
        volume_config["geometry"]["block-width"] = self._block_width
        volume_config["geometry"][
            "bounding-box"] = self._bounding_box_zyx[:, ::-1].tolist()
        volume_config["geometry"][
            "message-block-shape"] = self._preferred_message_shape_zyx[::
                                                                       -1].tolist(
                                                                       )
    @property
    def resource_manager_client(self):
        """
        Return the base_service's resource manager client.
        If the base service doesn't override this property,
        the default is to return a dummy client.
        (See dvid_resource_manager/client.py)
        """
        if self.base_service is self:
            # Dummy client
            return ResourceManagerClient("", 0)
        return self.base_service.resource_manager_client
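
Comments accompanying other snippets here note that an empty resource-server yields a "dummy client". A hedged sketch of that behavior (assumed from those comments, not verified): with no server configured, access_context() presumably grants access immediately instead of coordinating with a resource manager server.

# Hedged sketch: a dummy client created with an empty server string.
dummy = ResourceManagerClient("", 0)
with dummy.access_context('my-resource', True, 1, 1000):
    pass  # assumed to proceed immediately, with no server round-trip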
Example #14
    def _init_service(self):
        """
        Initialize the input service,
        and fill in 'auto' config values as needed.

        Also check the service configurations for errors.
        """
        input_config = self.config["input"]
        mgr_options = self.config["resource-manager"]

        self.mgr_client = ResourceManagerClient(mgr_options["server"],
                                                mgr_options["port"])
        self.input_service = VolumeService.create_from_config(
            input_config, self.mgr_client)
    def _init_services(self):
        """
        Initialize the input and output services,
        and fill in 'auto' config values as needed.
        
        Also check the service configurations for errors.
        """
        input_config = self.config["input"]
        output_config = self.config["output"]
        mgr_options = self.config["resource-manager"]

        self.mgr_client = ResourceManagerClient(mgr_options["server"],
                                                mgr_options["port"])
        self.input_service = VolumeService.create_from_config(
            input_config, self.mgr_client)

        assert isinstance(self.input_service.base_service, DvidVolumeService)
        assert self.input_service.base_service.supervoxels, \
            'DVID input service config must use "supervoxels: true"'

        assert not output_config["dvid"]["create-if-necessary"], \
            "This workflow is designed to write to pre-existing DVID instances, not create them from scratch."

        # Replace 'auto' dimensions with input bounding box
        replace_default_entries(output_config["geometry"]["bounding-box"],
                                self.input_service.bounding_box_zyx[:, ::-1])
        self.output_service = VolumeService.create_from_config(
            output_config, self.mgr_client)
        output_service = self.output_service
        assert isinstance(output_service.base_service, DvidVolumeService)
        assert output_service.base_service.supervoxels, \
            'DVID output service config must use "supervoxels: true"'

        assert output_service.disable_indexing, \
            "During ingestion, indexing should be disabled.\n" \
            "Please add 'disable-indexing':true to your output dvid config."

        logger.info(
            f"Output bounding box (xyz) is: {output_service.bounding_box_zyx[:,::-1].tolist()}"
        )

        assert not (self.input_service.bounding_box_zyx % self.input_service.block_width).any(), \
            "Input bounding box must be a multiple of the block width"

        assert (self.input_service.bounding_box_zyx == self.output_service.bounding_box_zyx).all(), \
            "Input bounding box and output bounding box must be identical."

        # FIXME: output message shape should match input message shape
        assert not any(np.array(output_service.preferred_message_shape) % output_service.block_width), \
            "Output message-block-shape should be a multiple of the block size in all dimensions."
    def _init_services(self):
        input_config = self.config_data["input"]
        output_config = self.config_data["output"]
        options = self.config_data["options"]

        self.mgr_client = ResourceManagerClient(options["resource-server"],
                                                options["resource-port"])
        self.input_service = VolumeService.create_from_config(
            input_config, self.config_dir, self.mgr_client)

        # Auto-set output size, if necessary.
        replace_default_entries(output_config["geometry"]["bounding-box"],
                                self.input_service.bounding_box_zyx[:, ::-1])
        self.output_service = SliceFilesVolumeServiceWriter(
            output_config, self.config_dir)
    def test_3_parallel_read_write_access(self):
        """
        Verify that a single client can be used from multiple threads
        (since it creates low-level _ResourceManagerClient objects per-thread as needed).
        
        Also, verify that the server DOES grant simultaneous access
        to two threads if one is reading and the other is writing,
        as long as neither is over capacity already.
        """
        resource = 'my-resource'
        DELAY = 0.5

        # Without this sleep this test fails intermittently with a strange error.
        # See https://github.com/janelia-flyem/DVIDResourceManager/issues/4
        # Does the server need time to initialize?
        time.sleep(0.5)

        client = ResourceManagerClient('127.0.0.1', SERVER_PORT, _debug=True)

        task_started = threading.Event()

        def long_task():
            with client.access_context(resource, True, 1, 1000):
                task_started.set()
                time.sleep(DELAY)

        start = time.time()
        th = threading.Thread(target=long_task)
        th.start()

        task_started.wait()
        with client.access_context(resource, False, 1, 1000):
            assert (time.time() - start) < DELAY, \
                "The server seems to have incorrectly forbidden parallel access for reading and writing."

        th.join()
    def test_6_reconfigure(self):
        client = ResourceManagerClient('127.0.0.1', SERVER_PORT, _debug=True)
        orig_config = client.read_config()

        new_config = orig_config.copy()
        new_config["read_reqs"] = 123
        new_config["write_reqs"] = 456

        client.reconfigure_server(new_config)
        assert client.read_config() == new_config
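
A possible follow-up to the test above (a sketch, not part of the original): restore the original settings so later tests see the server's default limits, since reconfigure_server() changes global server state.

        client.reconfigure_server(orig_config)
        assert client.read_config() == orig_config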
Example #19
    def _init_service(self):
        options = self.config["sparsemeshes"]
        input_config = self.config["input"]
        mgr_options = self.config["resource-manager"]

        self.mgr_client = ResourceManagerClient(mgr_options["server"],
                                                mgr_options["port"])
        self.input_service = VolumeService.create_from_config(
            input_config, self.mgr_client)
        assert isinstance(self.input_service, DvidVolumeService), \
            "Input must be plain dvid source, not scaled, transposed, etc."

        min_scale = options["min-scale"]
        max_scale = max(self.input_service.available_scales)
        assert min_scale <= max_scale, \
            f"Largest available scale in the input ({max_scale}) is smaller than the min-scale you provided ({min_scale})."
Example #20
    def init_brickwall(self):
        input_config = self.config["input"]
        mask_input_config = self.config["mask-input"]
        mgr_config = self.config["resource-manager"]
        options = self.config["sparseblockstats"]
        
        resource_mgr_client = ResourceManagerClient( mgr_config["server"], mgr_config["port"] )
        input_service = VolumeService.create_from_config( input_config, resource_mgr_client )
        mask_service = VolumeService.create_from_config( mask_input_config, resource_mgr_client )
        
        assert (input_service.preferred_message_shape == mask_service.preferred_message_shape).all(), \
            "This workflow assumes that the input and the mask-input use the same brick grid."

        assert not (input_service.preferred_message_shape % input_service.block_width).any(), \
            "input brick grid spacing must be a multiple of the input's block-width"
        assert not (mask_service.preferred_message_shape % mask_service.block_width).any(), \
            "mask brick grid spacing must be a multiple of the mask's block-width"

        is_supervoxels = False
        if isinstance(mask_service.base_service, DvidVolumeService):
            is_supervoxels = mask_service.base_service.supervoxels

        # Load body list and eliminate duplicates
        subset_labels = load_body_list(options["mask-labels"], is_supervoxels)
        subset_labels = set(subset_labels)

        if not subset_labels:
            raise RuntimeError("You didn't specify any mask subset labels. "
                               "If you want to compute block stats for an entire segmentation volume, use the CopySegmentation workflow.")

        sbm = mask_service.sparse_block_mask_for_labels(subset_labels)
        if ((sbm.box[1] - sbm.box[0]) == 0).any():
            raise RuntimeError("Could not find sparse masks for any of the mask-labels")

        with Timer("Initializing BrickWall", logger):
            # Aim for 2 GB RDD partitions when loading segmentation
            GB = 2**30
            target_partition_size_voxels = 2 * GB // np.uint64().nbytes
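            # (2 * 2**30 bytes) / 8 bytes per uint64 voxel = 268,435,456 voxels per partition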
            brickwall = BrickWall.from_volume_service(input_service, 0, None, self.client, target_partition_size_voxels, 0, sbm)

            # Pad if necessary to ensure that all fetched bricks are block-aligned
            block_shape = 3*(input_service.block_width,)
            brickwall = brickwall.fill_missing(input_service.get_subvolume, Grid(block_shape))

        return brickwall
    def init_services(self):
        """
        Initialize the input, mask, and output services,
        and fill in 'auto' config values as needed.
        """
        mgr_config = self.config["resource-manager"]
        input_config = self.config["input"]
        mask_config = self.config["mask"]
        output_config = self.config["output"]

        resource_mgr_client = ResourceManagerClient(mgr_config["server"],
                                                    mgr_config["port"])
        input_service = VolumeService.create_from_config(
            input_config, resource_mgr_client)
        logger.info(
            f"Bounding box: {input_service.bounding_box_zyx[:,::-1].tolist()}")

        # Replace default entries in the output and mask bounding boxes
        replace_default_entries(mask_config["geometry"]["bounding-box"],
                                input_service.bounding_box_zyx[:, ::-1])
        replace_default_entries(output_config["geometry"]["bounding-box"],
                                input_service.bounding_box_zyx[:, ::-1])

        mask_service = VolumeService.create_from_config(
            mask_config, resource_mgr_client)
        output_service = VolumeService.create_from_config(
            output_config, resource_mgr_client)

        if (input_service.preferred_message_shape !=
                mask_service.preferred_message_shape).any():
            raise RuntimeError(
                "Your input volume and mask volume must use the same message-block-shape."
            )

        if isinstance(output_service.base_service, DvidVolumeService
                      ) and not output_service.base_service.write_empty_blocks:
            logger.warning(
                "Your output config does not set write-empty-blocks: True "
                "-- consider changing that to avoid writing needless zeros to DVID!."
            )

        return input_service, mask_service, output_service
def post_swcs_to_dvid(config, items):
    """
    Send the given SWC files as key/value pairs to DVID.
    
    Args:
        config: The CreateSkeletons workflow config data
    
        items: list-of-tuples (body_id, swc_text, error_text)
               If swc_text is None or error_text is NOT None, then nothing is posted.
               (We could have filtered out such items upstream, but it's convenient to just handle it here.)
    """
    # Re-use session for connection pooling.
    session = default_dvid_session()

    # Re-use resource manager client connections, too.
    # (If resource-server is empty, this will return a "dummy client")
    resource_client = ResourceManagerClient(
        config["options"]["resource-server"],
        config["options"]["resource-port"])

    dvid_server = config["dvid-info"]["dvid"]["server"]
    uuid = config["dvid-info"]["dvid"]["uuid"]
    instance = config["dvid-info"]["dvid"]["skeletons-destination"]

    for (body_id, swc_contents, err) in items:
        if swc_contents is None or err is not None:
            continue

        swc_contents = swc_contents.encode('utf-8')

        @auto_retry(3, pause_between_tries=60.0, logging_name=__name__)
        def write_swc():
            with resource_client.access_context(dvid_server, False, 1,
                                                len(swc_contents)):
                session.post(
                    f'{dvid_server}/api/node/{uuid}/{instance}/key/{body_id}_swc',
                    swc_contents)

        write_swc()
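
A hedged usage sketch based on the Args described in the docstring above; the body IDs and SWC text are hypothetical, and 'config' stands for the CreateSkeletons workflow config:

items = [
    (1001, "# SWC content...", None),        # posted to DVID under key '1001_swc'
    (1002, None, "skeletonization failed"),  # skipped: no SWC text / has an error
]
post_swcs_to_dvid(config, items)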
    def init_services(self):
        """
        Initialize the input and output services,
        and fill in 'auto' config values as needed.
        """
        input_config = self.config["input"]
        output_config = self.config["output"]
        mgr_config = self.config["resource-manager"]

        self.resource_mgr_client = ResourceManagerClient( mgr_config["server"], mgr_config["port"] )
        self.input_service = VolumeService.create_from_config( input_config, self.resource_mgr_client )

        # If we need to create a dvid instance for the output,
        # default to the same pyramid depth as the input
        if ("dvid" in input_config) and ("dvid" in output_config) and (output_config["dvid"]["creation-settings"]["max-scale"] == -1):
            info = fetch_instance_info(*self.input_service.base_service.instance_triple)
            pyramid_depth = info['Extended']['MaxDownresLevel']
            output_config["dvid"]["creation-settings"]["max-scale"] = pyramid_depth

        replace_default_entries(output_config["geometry"]["bounding-box"], self.input_service.bounding_box_zyx[:, ::-1])

        self.output_service = VolumeService.create_from_config( output_config, self.resource_mgr_client )
        assert isinstance( self.output_service, VolumeServiceWriter ), \
            "The output format you are attempting to use does not support writing"

        if isinstance(self.output_service.base_service, DvidVolumeService):
            if not self.output_service.base_service.supervoxels:
                raise RuntimeError("Can't write to a non-supervoxels output service.")

            if not self.output_service.base_service.disable_indexing:
                logger.warning("******************************************************************************")
                logger.warning("Your output config does not specify 'disable-indexing', which means DVID will "
                               "attempt to index all voxels as they are written to the volume. "
                               "For large volumes, this is NOT recommended!"
                               "(You should run a separate job to recompute the labelindex afterwards.)")
                logger.warning("******************************************************************************")

        logger.info(f"Output bounding box: {self.output_service.bounding_box_zyx[:,::-1].tolist()}")
Example #24
    def execute(self):
        options = self.config["mitodistances"]
        output_dir = self.config["output-directory"]
        body_svc, mito_svc = self.init_services()

        # Resource manager context must be initialized before resource manager client
        # (to overwrite config values as needed)
        dvid_mgr_config = self.config["dvid-access-manager"]
        dvid_mgr_context = LocalResourceManager(dvid_mgr_config)
        dvid_mgr_client = ResourceManagerClient(dvid_mgr_config["server"],
                                                dvid_mgr_config["port"])

        syn_server, syn_uuid, syn_instance = (options['synapse-criteria'][k]
                                              for k in ('server', 'uuid',
                                                        'instance'))
        syn_conf = float(options['synapse-criteria']['confidence'])
        syn_types = ['PreSyn', 'PostSyn']
        if options['synapse-criteria']['type'] == 'pre':
            syn_types = ['PreSyn']
        elif options['synapse-criteria']['type'] == 'post':
            syn_types = ['PostSyn']

        bodies = load_body_list(options["bodies"], False)
        skip_flags = [
            os.path.exists(f'{output_dir}/{body}.csv') for body in bodies
        ]
        bodies_df = pd.DataFrame({'body': bodies, 'should_skip': skip_flags})
        bodies = bodies_df.query('not should_skip')['body']

        # Shuffle for better load balance?
        # TODO: Would be better to sort by synapse count, and put large bodies first,
        #       assigned to partitions in round-robin style.
        #       Then work stealing will be more effective at knocking out the smaller jobs at the end.
        #       This requires knowing all the body sizes, though.
        #       Perhaps mito count would be a decent proxy for synapse count, and it's readily available.
        #bodies = bodies.sample(frac=1.0).values

        os.makedirs('body-logs')
        os.makedirs(output_dir, exist_ok=True)

        mito_server, mito_uuid, mito_instance = (options['mito-labelmap'][k]
                                                 for k in ('server', 'uuid',
                                                           'instance'))

        @auto_retry(3)
        def _fetch_synapses(body):
            with dvid_mgr_client.access_context(syn_server, True, 1, 1):
                syn_df = fetch_annotation_label(syn_server,
                                                syn_uuid,
                                                syn_instance,
                                                body,
                                                format='pandas')
                if len(syn_df) == 0:
                    return syn_df
                # syn_types and syn_conf are referenced via '@' in the query string below.
                syn_types, syn_conf
                syn_df = syn_df.query(
                    'kind in @syn_types and conf >= @syn_conf').copy()
                return syn_df[[*'xyz', 'kind', 'conf'
                               ]].sort_values([*'xyz']).reset_index(drop=True)

        @auto_retry(3)
        def _fetch_mito_ids(body):
            with dvid_mgr_client.access_context(mito_server, True, 1, 1):
                try:
                    return fetch_supervoxels(mito_server, mito_uuid,
                                             mito_instance, body)
                except HTTPError:
                    return []

        def process_and_save(body):
            tbars = _fetch_synapses(body)
            valid_mitos = _fetch_mito_ids(body)

            # TODO:
            #   Does the stdout_redirected() mechanism work correctly in the context of multiprocessing?
            #   If not, I should probably just use a custom logging handler instead.
            with open(f"body-logs/{body}.log",
                      "w") as f, stdout_redirected(f), Timer() as timer:
                processed_tbars = []
                if len(tbars) == 0:
                    logging.getLogger(__name__).warning(
                        f"Body {body}: No synapses found")

                if len(valid_mitos) == 0:
                    logging.getLogger(__name__).warning(
                        f"Body {body}: Failed to fetch mito supervoxels")
                    processed_tbars = initialize_results(body, tbars)

                if len(valid_mitos) and len(tbars):
                    processed_tbars = measure_tbar_mito_distances(
                        body_svc,
                        mito_svc,
                        body,
                        tbars=tbars,
                        valid_mitos=valid_mitos)

            if len(processed_tbars) > 0:
                processed_tbars.to_csv(f'{output_dir}/{body}.csv',
                                       header=True,
                                       index=False)
                with open(f'{output_dir}/{body}.pkl', 'wb') as f:
                    pickle.dump(processed_tbars, f)

            if len(tbars) == 0:
                return (body, 0, 'no-synapses', timer.seconds)

            if len(valid_mitos) == 0:
                return (body, len(processed_tbars), 'no-mitos', timer.seconds)

            return (body, len(tbars), 'success', timer.seconds)

        logger.info(
            f"Processing {len(bodies)} bodies, skipping {bodies_df['should_skip'].sum()}"
        )

        def process_batch(bodies):
            return [*map(process_and_save, bodies)]

        with dvid_mgr_context:
            batch_size = max(1, len(bodies) // 10_000)
            futures = self.client.map(process_batch,
                                      iter_batches(bodies, batch_size))

            # Support synchronous testing with a fake 'as_completed' object
            if hasattr(self.client, 'DEBUG'):
                ac = as_completed_synchronous(futures, with_results=True)
            else:
                ac = distributed.as_completed(futures, with_results=True)

            try:
                results = []
                for f, r in tqdm_proxy(ac, total=len(futures)):
                    results.extend(r)
            finally:
                results = pd.DataFrame(
                    results,
                    columns=['body', 'synapses', 'status', 'processing_time'])
                results.to_csv('results-summary.csv', header=True, index=False)
                num_errors = len(results.query('status == "error"'))
                if num_errors:
                    logger.warning(
                        f"Encountered {num_errors} errors. See results-summary.csv"
                    )
Example #25
    def _init_services(self):
        """
        Initialize the input and output services,
        and fill in 'auto' config values as needed.

        Also check the service configurations for errors.
        """
        input_config = self.config["input"]
        output_config = self.config["output"]
        mgr_options = self.config["resource-manager"]

        options = self.config["copysegmentation"]
        slab_depth = options["slab-depth"]
        pyramid_depth = options["pyramid-depth"]
        permit_inconsistent_pyramids = options["permit-inconsistent-pyramid"]

        self.mgr_client = ResourceManagerClient(mgr_options["server"],
                                                mgr_options["port"])
        self.input_service = VolumeService.create_from_config(
            input_config, self.mgr_client)

        brick_shape = self.input_service.preferred_message_shape
        if slab_depth % brick_shape[0] != 0:
            logger.warning(
                f"Your slab-depth {slab_depth} is not a multiple of the input's brick width {brick_shape[0]}"
            )

        if isinstance(self.input_service.base_service, DvidVolumeService):
            assert input_config["dvid"]["supervoxels"], \
                'DVID input service config must use "supervoxels: true"'

        # Special handling for creation of multi-scale outputs:
        # auto-configure the pyramid depths
        multiscale_output_type = None
        for t in ["dvid", "n5", "zarr"]:
            if t in output_config and not hasattr(output_config[t],
                                                  'from_default'):
                multiscale_output_type = t
        if multiscale_output_type:
            out_fmt = multiscale_output_type
            if output_config[out_fmt]["create-if-necessary"]:
                if self.config["copysegmentation"][
                        "skip-scale-0-write"] and pyramid_depth == 0:
                    # Nothing to write.  Maybe the user is just computing block statistics.
                    msg = (
                        "Since your config specifies no pyramid levels to write, no output instance will be created. "
                        "Avoid this warning by removing 'create-if-necessary' from your config"
                    )
                    logger.warning(msg)
                    output_config[out_fmt]["create-if-necessary"] = False
                else:
                    max_scale = output_config[out_fmt]["creation-settings"][
                        "max-scale"]
                    if max_scale not in (-1, pyramid_depth):
                        msg = (
                            f"Inconsistent max-scale ({max_scale}) and pyramid-depth ({pyramid_depth}). "
                            "Omit max-scale from your creation-settings.")
                        raise RuntimeError(msg)
                    output_config[out_fmt]["creation-settings"][
                        "max-scale"] = pyramid_depth

        # Replace 'auto' dimensions with input bounding box
        replace_default_entries(output_config["geometry"]["bounding-box"],
                                self.input_service.bounding_box_zyx[:, ::-1])
        self.output_service = VolumeService.create_from_config(
            output_config, self.mgr_client)
        output_service = self.output_service
        assert isinstance(output_service, VolumeServiceWriter)

        if "dvid" in output_config:
            assert output_config["dvid"]["supervoxels"], \
                'DVID output service config must use "supervoxels: true"'

            if output_service.instance_name in fetch_repo_instances(
                    output_service.server, output_service.uuid):
                existing_depth = self._read_pyramid_depth()
                if pyramid_depth not in (
                        -1,
                        existing_depth) and not permit_inconsistent_pyramids:
                    raise Exception(
                        f"Can't set pyramid-depth to {pyramid_depth}: "
                        f"Data instance '{output_service.instance_name}' already existed, with depth {existing_depth}"
                    )

        # These services aren't supported because we copied some geometry (bounding-box)
        # directly from the input service.
        assert not isinstance(output_service, TransposedVolumeService)
        assert not isinstance(
            output_service,
            ScaledVolumeService) or output_service.scale_delta == 0

        if isinstance(self.output_service.base_service, DvidVolumeService):
            assert output_service.base_service.disable_indexing, \
                "During ingestion, dvid labelmap indexing should be disabled.\n" \
                "Please add 'disable-indexing: true' to your output dvid config."

        logger.info(
            f"Output bounding box (xyz) is: {output_service.bounding_box_zyx[:,::-1].tolist()}"
        )

        input_shape = -np.subtract(*self.input_service.bounding_box_zyx)
        output_shape = -np.subtract(*output_service.bounding_box_zyx)

        assert not any(np.array(output_service.preferred_message_shape) % output_service.block_width), \
            "Output message-block-shape should be a multiple of the block size in all dimensions."
        assert (input_shape == output_shape).all(), \
            "Input bounding box and output bounding box do not have the same dimensions"

        if ("apply-labelmap" in output_config["adapters"]) and (
                output_config["adapters"]["apply-labelmap"]["file-type"] !=
                "__invalid__"):
            assert output_config["adapters"]["apply-labelmap"]["apply-when"] == "reading-and-writing", \
                "Labelmap will be applied to voxels during pre-write and post-read (due to block padding).\n"\
                "You cannot use this workflow with non-idempotent labelmaps, unless your data is already perfectly block aligned."
def post_meshes_to_dvid(config, instance_name, partition_items):
    """
    Send the given meshes (either .obj or .drc) as key/value pairs to DVID.
    
    Args:
        config: The CreateMeshes workflow config data
        
        instance_name: key-value instance to post to
            
        partition_items: tuple (group_id, [(segment_id, mesh_data), (segment_id, mesh_data)])
    """
    # Re-use session for connection pooling.
    session = default_dvid_session()

    # Re-use resource manager client connections, too.
    # (If resource-server is empty, this will return a "dummy client")
    resource_client = ResourceManagerClient(
        config["options"]["resource-server"],
        config["options"]["resource-port"])

    dvid_server = config["dvid-info"]["dvid"]["server"]
    uuid = config["dvid-info"]["dvid"]["uuid"]

    grouping_scheme = config["mesh-config"]["storage"]["grouping-scheme"]
    mesh_format = config["mesh-config"]["storage"]["format"]

    if grouping_scheme == "no-groups":
        for group_id, segment_ids_and_meshes in partition_items:
            for (segment_id, mesh_data) in segment_ids_and_meshes:

                @auto_retry(3, pause_between_tries=60.0, logging_name=__name__)
                def write_mesh():
                    with resource_client.access_context(
                            dvid_server, False, 2, len(mesh_data)):
                        session.post(
                            f'{dvid_server}/api/node/{uuid}/{instance_name}/key/{segment_id}',
                            mesh_data)
                        session.post(
                            f'{dvid_server}/api/node/{uuid}/{instance_name}/key/{segment_id}_info',
                            json={'format': mesh_format})

                write_mesh()
    else:
        # All other grouping schemes, including 'singletons', write tarballs.
        # (In the 'singletons' case, there is just one tarball per body.)
        for group_id, segment_ids_and_meshes in partition_items:
            tar_name = _get_group_name(config, group_id)
            tar_stream = BytesIO()
            with closing(tarfile.open(tar_name, 'w', tar_stream)) as tf:
                for (segment_id, mesh_data) in segment_ids_and_meshes:
                    mesh_name = _get_mesh_name(config, segment_id)
                    f_info = tarfile.TarInfo(mesh_name)
                    f_info.size = len(mesh_data)
                    tf.addfile(f_info, BytesIO(mesh_data))

            tar_bytes = tar_stream.getbuffer()

            @auto_retry(3, pause_between_tries=60.0, logging_name=__name__)
            def write_tar():
                with resource_client.access_context(dvid_server, False, 1,
                                                    len(tar_bytes)):
                    session.post(
                        f'{dvid_server}/api/node/{uuid}/{instance_name}/key/{tar_name}',
                        tar_bytes)

            write_tar()
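
A hedged usage sketch of the partition_items structure described in the docstring above; the group/segment IDs, mesh bytes, and instance name are hypothetical:

partition_items = [
    (12345, [(12345001, b'...mesh bytes...'),
             (12345002, b'...mesh bytes...')]),
]
post_meshes_to_dvid(config, 'segmentation_meshes_tars', partition_items)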
    def execute(self):
        import pandas as pd
        self._sanitize_config()

        config = self.config_data
        options = config["options"]

        resource_mgr_client = ResourceManagerClient(options["resource-server"],
                                                    options["resource-port"])
        volume_service = VolumeService.create_from_config(
            config["dvid-info"], self.config_dir, resource_mgr_client)

        self._init_meshes_instances()

        # Aim for 2 GB RDD partitions
        GB = 2**30
        target_partition_size_voxels = 2 * GB // np.uint64().nbytes

        # This will return None if we're not using sparse blocks
        sparse_block_mask = self._get_sparse_block_mask(volume_service)

        brick_wall = BrickWall.from_volume_service(
            volume_service, 0, None, self.sc, target_partition_size_voxels,
            sparse_block_mask)
        brick_wall.persist_and_execute("Downloading segmentation", logger)

        # brick -> [ (segment_label, (box, mask, count)),
        #            (segment_label, (box, mask, count)), ... ]
        segments_and_masks = brick_wall.bricks.map(
            partial(compute_segment_masks, config))
        persist_and_execute(segments_and_masks,
                            "Computing brick-local segment masks", logger)
        brick_wall.unpersist()
        del brick_wall

        with Timer("Computing segment statistics", logger):
            mask_stats_df = self.compute_mask_stats(segments_and_masks)

        # Flatten now, AFTER stats have been computed
        # (compute_mask_stats() requires that the RDDs not have duplicate labels in them.)
        # While we're at it, drop the count (not needed any more)
        # --> (segment_label, (box, mask))
        def drop_count(items):
            new_items = []
            for item in items:
                segment_label, (box, mask, _count) = item
                new_items.append((segment_label, (box, mask)))
            return new_items

        segments_and_masks = segments_and_masks.flatMap(drop_count)

        bad_segments = mask_stats_df[[
            'segment', 'compressed_bytes'
        ]].query('compressed_bytes > 1.9e9')['segment']
        if len(bad_segments) > 0:
            logger.error(
                f"SOME SEGMENTS (N={len(bad_segments)}) ARE TOO BIG TO PROCESS.  Skipping segments: {list(bad_segments)}."
            )
            segments_and_masks = segments_and_masks.filter(
                lambda seg_mask: seg_mask[0] not in bad_segments.values)

        # (segment, (box, mask))
        #   --> (segment, boxes_and_masks)
        #   === (segment, [(box, mask), (box, mask), (box, mask), ...])
        masks_by_segment_id = segments_and_masks.groupByKey()
        persist_and_execute(masks_by_segment_id,
                            "Grouping segment masks by segment label ID",
                            logger)
        segments_and_masks.unpersist()
        del segments_and_masks

        # Insert chosen downsample_factor (a.k.a. dsf)
        #   --> (segment, dsf_and_boxes_and_masks)
        #   === (segment, (downsample_factor, [(box, mask), (box, mask), (box, mask), ...]))
        # Must use '.values' here; otherwise the index is used to read the initial data.
        downsample_df = pd.Series(mask_stats_df['downsample_factor'].values,
                                  index=mask_stats_df['segment'].values)

        def insert_dsf(item):
            segment, boxes_and_masks = item
            downsample_factor = downsample_df[segment]
            return (segment, (downsample_factor, boxes_and_masks))

        masks_by_segment_id = masks_by_segment_id.map(insert_dsf)

        ##
        ## Filter out small segments and/or small bodies
        ##
        keep_col = mask_stats_df['keep_segment'] & mask_stats_df['keep_body']
        if not keep_col.all():
            # Note: This array will be broadcasted to the workers.
            #       It will be potentially quite large if we're keeping most (but not all) segments.
            #       Broadcast expense should be minimal thanks to lz4 compression,
            #       but RAM usage will be high.
            segments_to_keep = mask_stats_df['segment'][keep_col].values
            filtered_masks_by_segment_id = masks_by_segment_id.filter(
                lambda key_and_value: key_and_value[0] in segments_to_keep)
            persist_and_execute(filtered_masks_by_segment_id,
                                "Filtering masks by segment and size", logger)
            del masks_by_segment_id
            masks_by_segment_id = filtered_masks_by_segment_id

        # Aggregate
        # --> (segment_label, (box, mask, downsample_factor))
        segment_box_mask_factor = masks_by_segment_id.mapValues(
            partial(combine_masks, config))
        persist_and_execute(segment_box_mask_factor, "Assembling masks",
                            logger)

        #
        # Re-compute meshes once for every simplification ratio in the config
        #
        for instance_name, simplification_ratio in zip(
                self.mesh_instances, config["mesh-config"]["simplify-ratios"]):

            def _generate_mesh(box_mask_factor):
                box, mask, factor = box_mask_factor
                return generate_mesh(config, simplification_ratio, box, mask,
                                     factor)

            # --> (segment_label, (mesh_bytes, vertex_count))
            segments_meshes_counts = segment_box_mask_factor.mapValues(
                _generate_mesh)
            persist_and_execute(
                segments_meshes_counts,
                f"Computing meshes at decimation {simplification_ratio:.2f}",
                logger)

            with Timer("Computing mesh statistics", logger):
                mask_and_mesh_stats_df = self.append_mesh_stats(
                    mask_stats_df, segments_meshes_counts,
                    f'{simplification_ratio:.2f}')

            # Update the 'keep_body' column: Skip meshes that are too big.
            huge_bodies = (mask_and_mesh_stats_df['body_mesh_bytes'] > 1.9e9)
            if huge_bodies.any():
                logger.error(
                    "SOME BODY MESH GROUPS ARE TOO BIG TO PROCESS.  See dumped DataFrame for details."
                )
                mask_and_mesh_stats_df['keep_body'] &= ~huge_bodies

                # Drop them from the processing list
                segments_in_huge_bodies = mask_and_mesh_stats_df['segment'][
                    huge_bodies].values
                segments_meshes_counts = segments_meshes_counts.filter(
                    lambda seg_and_values: not (seg_and_values[0] in
                                                segments_in_huge_bodies))

            # --> (segment_label, mesh_bytes)
            def drop_vcount(item):
                segment_label, (mesh_bytes, _vertex_count) = item
                return (segment_label, mesh_bytes)

            segments_and_meshes = segments_meshes_counts.map(drop_vcount)

            # Group by body ID
            # --> ( body_id ( segment_label, mesh_bytes ) )
            grouped_body_ids_segments_meshes = self.group_by_body(
                segments_and_meshes)
            unpersist(segments_and_meshes)
            del segments_and_meshes

            unpersist(segments_meshes_counts)
            del segments_meshes_counts

            with Timer("Writing meshes to DVID", logger):
                grouped_body_ids_segments_meshes.foreachPartition(
                    partial(post_meshes_to_dvid, config, instance_name))

            unpersist(grouped_body_ids_segments_meshes)
            del grouped_body_ids_segments_meshes
Example #28
    def execute(self):
        self._sanitize_config()
        self._prepare_output()

        input_config = self.config["input"]["dvid"]
        output_config = self.config["output"]
        options = self.config["svdecimate"]
        resource_config = self.config["resource-manager"]

        resource_mgr_client = ResourceManagerClient(resource_config["server"], resource_config["port"])

        server = input_config["server"]
        uuid = input_config["uuid"]
        tsv_instance = input_config["tarsupervoxels-instance"]

        bodies = load_body_list(options["bodies"], False)

        # Determine segmentation instance
        info = fetch_instance_info(server, uuid, tsv_instance)
        input_format = info["Extended"]["Extension"]

        output_format = options["format"]

        if (np.array(options["rescale"]) == 1.0).all() and output_format == "ngmesh" and input_format != "ngmesh":
            logger.warning("*** You are converting to ngmesh format, but you have not specified a rescale parameter! ***")

        decimation_lib = options["decimation-library"]
        max_sv_vertices = options["max-sv-vertices"]
        max_body_vertices = options["max-body-vertices"]
        num_procs = options["processes-per-body"]

        def process_body(body_id):
            with resource_mgr_client.access_context( input_config["server"], True, 1, 0 ):
                tar_bytes = fetch_tarfile(server, uuid, tsv_instance, body_id)

            sv_meshes = Mesh.from_tarfile(tar_bytes, concatenate=False)
            sv_meshes = {int(os.path.splitext(name)[0]): m for name, m in sv_meshes.items()}

            total_body_vertices = sum([len(m.vertices_zyx) for m in sv_meshes.values()])
            decimation = min(1.0, max_body_vertices / total_body_vertices)

            try:
                _process_sv = partial(process_sv, decimation, decimation_lib, max_sv_vertices, output_format)
                if num_procs <= 1:
                    output_table = [*starmap(_process_sv, sv_meshes.items())]
                else:
                    output_table = compute_parallel(_process_sv, sv_meshes.items(), starmap=True, processes=num_procs, ordered=False, show_progress=False)

                cols = ['sv', 'orig_vertices', 'final_vertices', 'final_decimation', 'effective_decimation', 'mesh_bytes']
                output_df = pd.DataFrame(output_table, columns=cols)
                output_df['body'] = body_id
                output_df['error'] = ""
                write_sv_meshes(output_df, output_config, output_format, resource_mgr_client)
            except Exception as ex:
                svs = [*sv_meshes.keys()]
                orig_vertices = [len(m.vertices_zyx) for m in sv_meshes.values()]
                output_df = pd.DataFrame({'sv': svs, 'orig_vertices': orig_vertices})
                output_df['final_vertices'] = -1
                output_df['final_decimation'] = -1
                output_df['effective_decimation'] = -1
                output_df['mesh_bytes'] = -1
                output_df['body'] = body_id
                output_df['error'] = str(ex)

            return output_df.drop(columns=['mesh_bytes'])

        futures = self.client.map(process_body, bodies)

        # Support synchronous testing with a fake 'as_completed' object
        if hasattr(self.client, 'DEBUG'):
            ac = as_completed_synchronous(futures, with_results=True)
        else:
            ac = distributed.as_completed(futures, with_results=True)

        try:
            stats = []
            for f, r in tqdm_proxy(ac, total=len(futures)):
                stats.append(r)
                if (r['error'] != "").any():
                    body = r['body'].iloc[0]
                    logger.warning(f"Body {body} failed!")

        finally:
            stats_df = pd.concat(stats)
            stats_df.to_csv('mesh-stats.csv', index=False, header=True)
            with open('mesh-stats.pkl', 'wb') as f:
                pickle.dump(stats_df, f)
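
The per-body decimation fraction computed in process_body() above is just a ratio capped at 1.0. A small worked example (the numbers are invented for illustration, not taken from any source config):

# If a body's supervoxel meshes total 4,000,000 vertices but the config
# allows at most 1,000,000 vertices per body, each supervoxel mesh is
# asked to decimate to 25% of its vertices (and never above 100%).
max_body_vertices = 1_000_000     # stand-in for options["max-body-vertices"]
total_body_vertices = 4_000_000   # sum of len(m.vertices_zyx) over the body's meshes

decimation = min(1.0, max_body_vertices / total_body_vertices)
assert decimation == 0.25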
Example #29
    def execute(self):
        options = self.config["mitorepair"]
        mgr_config = self.config["resource-manager"]
        resource_mgr_client = ResourceManagerClient(mgr_config["server"],
                                                    mgr_config["port"])
        seg_service, mask_service = self.init_services(resource_mgr_client)

        labelmap_server = options["dvid-labelmap-size-src"]["server"]
        labelmap_uuid = options["dvid-labelmap-size-src"]["uuid"]
        labelmap_name = options["dvid-labelmap-size-src"]["segmentation-name"]
        if labelmap_server:
            assert labelmap_server and labelmap_uuid and labelmap_name, \
                "Invalid labelmap specification"
            body_seg_dvid_src = (labelmap_server, labelmap_uuid, labelmap_name)
        else:
            body_seg_dvid_src = None

        # Boxes are determined by the left volume/labels/roi
        chunk_shape = np.array(3 * (options["chunk-width-s0"], ))
        boxes = self.init_boxes(seg_service, options["roi"], chunk_shape)
        logger.info(f"Processing {len(boxes)} bricks in total.")

        with Timer(
                "Finding merges to repair mito fragmentation in the segmentation",
                logger):

            def process_box(central_box):
                fragment_table = mito_body_assignments_for_box(
                    seg_service,
                    mask_service,
                    central_box,
                    options["halo-width-s0"],
                    options["analysis-scale"],
                    body_seg_dvid_src,
                    resource_mgr_client=resource_mgr_client)
                return fragment_table

            # Compute block-wise, and drop empty results
            fragment_tables = (db.from_sequence(boxes, partition_size=1)
                               .map(process_box)
                               .compute())
            fragment_tables = [*filter(lambda t: t is not None, fragment_tables)]

        with Timer("Combining fragment tables", logger):
            combined_table = pd.concat(fragment_tables)
            with open('combined-fragment-table.pkl', 'wb') as f:
                pickle.dump(combined_table, f)

        with Timer("Selecting top merge for each mito body", logger):
            filtered_table = (combined_table[['body_size_local_vol', 'hull_body']]
                              .query('hull_body != 0')
                              .sort_values('body_size_local_vol', ascending=False)
                              .groupby('body')
                              .head(1))

        try:
            filtered_table = self.append_synapse_columns(
                filtered_table, options["neuprint"])
        except Exception as ex:
            logger.error(
                f"Was not able to append synapse data from neuprint:\n{ex}")

        if body_seg_dvid_src:
            with Timer("Fetching sizes from DVID"):
                hull_body_sizes = fetch_sizes(
                    *body_seg_dvid_src,
                    filtered_table['hull_body'].values,
                    processes=8)
                filtered_table['hull_body_size'] = hull_body_sizes.values
        else:
            # The 'body_size' column is misleading if it wasn't actually fetched from dvid.
            del filtered_table['body_size']

        with Timer("Writing unfiltered top choices", logger):
            with open('unfiltered-top-choices-table.pkl', 'wb') as f:
                pickle.dump(filtered_table, f)

        with Timer(
                "Filtering out ambiguously merged bodies and bodies with synapses",
                logger):
            # Some bodies might be identified as a "mito body" in one block and a "hull body" in another.
            # (In the hemibrain v1.1 dataset, this was the case for 0.06% of all identified mito fragments.)
            # Such cases typically occur in areas of bad segmentation where our repairs don't have much hope of helping anyway.
            # Furthermore, there's nothing that prevents the merge decisions (mito -> hull) from being cyclical in such cases.
            # We could "fix" the issue by coming up with a rule, e.g. force the smaller one to merge into the bigger one,
            # and then transitively merge until we get to a non-mito body, but that seems too complicated for such a
            # tiny fraction of cases.
            # So, just drop those rows.  (A minimal pandas sketch of this filter appears after this method.)
            hull_bodies = filtered_table['hull_body'].unique()
            conflict_bodies = filtered_table.index.intersection(hull_bodies).unique()
            filtered_table = filtered_table.query('body not in @conflict_bodies')

            # We don't want to change the connectome. Drop mito bodies with synapses.
            if 'pre' in filtered_table.columns:
                filtered_table = filtered_table.query('pre == 0 and post == 0')

        with Timer("Writing final results", logger):
            filtered_table.to_csv('final-fragment-table.csv',
                                  header=True,
                                  index=True)
            with open('final-fragment-table.pkl', 'wb') as f:
                pickle.dump(filtered_table, f)

            if 'pre' in filtered_table.columns:
                # Also save in the format that can be loaded by a LabelmappedVolumeService
                (filtered_table.reset_index()[['body', 'hull_body']]
                    .rename(columns={'body': 'orig', 'hull_body': 'new'})
                    .to_csv('final-remapping.csv', index=False, header=True))
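
The following is a minimal, self-contained pandas sketch of the conflict-body filter described in the comment inside execute() above. The table layout mirrors the code (index named 'body', a 'hull_body' column), but the IDs are invented for illustration:

import pandas as pd

# Body 7 is listed as a mito body (index) but also appears as the hull
# body for body 5, so it is ambiguous and gets dropped.
filtered_table = pd.DataFrame({'hull_body': [10, 7, 20]},
                              index=pd.Index([5, 7, 9], name='body'))

hull_bodies = filtered_table['hull_body'].unique()
conflict_bodies = filtered_table.index.intersection(hull_bodies).unique()
filtered_table = filtered_table.query('body not in @conflict_bodies')

assert list(filtered_table.index) == [5, 9]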
    def sparse_brick_coords_for_labels(self, labels, clip=True):
        """
        Return a DataFrame indicating the brick
        coordinates (starting corner) that encompass the given labels.

        Args:
            labels:
                A list of body IDs (if ``self.supervoxels`` is False),
                or supervoxel IDs (if ``self.supervoxels`` is True).

            clip:
                If True, filter the results to exclude any coordinates
                that fall outside this service's bounding-box.
                Otherwise, all brick coordinates that encompass the given labels
                will be returned, whether or not they fall within the bounding box.

        Returns:
            DataFrame with columns [z,y,x,label],
            where z,y,x represents the starting corner (in full-res coordinates)
            of a brick that contains the label.
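
        Example:
            A hypothetical call (the service object, label IDs, and the
            resulting brick count are illustrative only)::

                coords_df = service.sparse_brick_coords_for_labels([1001, 1002])
                assert list(coords_df.columns) == ['z', 'y', 'x', 'label']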
        """
        assert not isinstance(labels, set), \
            "Pass labels as a list or array, not a set"
        labels = pd.unique(labels)
        is_supervoxels = self.supervoxels
        brick_shape = self.preferred_message_shape
        assert (brick_shape % self.block_width == 0).all(), \
            ("Brick shape ('preferred-message-shape') must be a multiple of the "
             f"block width ({self.block_width}) in all dimensions, not {brick_shape}")

        bad_labels = []

        if not is_supervoxels:
            # No supervoxel filtering.
            # Sort by body, since that should be slightly nicer for dvid performance.
            bodies_and_svs = {label: None for label in sorted(labels)}
        else:
            # Arbitrary heuristic for whether to do the body-lookups on DVID or on the client.
            if len(labels) < 100_000:
                # If we're only dealing with a few supervoxels,
                # ask dvid to map them to bodies for us.
                mapping = fetch_mapping(*self.instance_triple,
                                        labels,
                                        as_series=True)
            else:
                # If we're dealing with a lot of supervoxels, ask for
                # the entire mapping, and look up the bodies ourselves.
                complete_mapping = fetch_mappings(*self.instance_triple)
                mapper = LabelMapper(complete_mapping.index.values,
                                     complete_mapping.values)

                labels = np.asarray(labels, np.uint64)
                bodies = mapper.apply(labels, True)
                mapping = pd.Series(index=labels, data=bodies, name='body')
                mapping.index.rename('sv', inplace=True)

            bad_svs = mapping[mapping == 0]
            bad_labels.extend(bad_svs.index.tolist())

            # Group by body
            mapping = mapping[mapping != 0]
            grouped_svs = mapping.reset_index().groupby('body').agg({'sv': list})['sv']

            # Sort by body, since that should be slightly nicer for dvid performance.
            bodies_and_svs = grouped_svs.sort_index().to_dict()

        # Extract these to avoid pickling 'self' (just for speed)
        server, uuid, instance = self.instance_triple
        if self._use_resource_manager_for_sparse_coords:
            mgr = self.resource_manager_client
        else:
            mgr = ResourceManagerClient("", 0)

        def fetch_brick_coords(body, supervoxel_subset):
            """
            Fetch the block coordinates for the given body,
            filter them for the given supervoxels (if any),
            and convert the block coordinates to brick coordinates.
            """
            assert is_supervoxels or supervoxel_subset is None

            try:
                with mgr.access_context(server, True, 1, 1):
                    labelindex = fetch_labelindex(server, uuid, instance, body,
                                                  'protobuf')
                coords_df = convert_labelindex_to_pandas(labelindex).blocks

            except HTTPError as ex:
                if (ex.response is not None
                        and ex.response.status_code == 404):
                    return (body, None)
                raise
            except RuntimeError as ex:
                if 'does not map to any body' in str(ex):
                    return (body, None)
                raise

            if len(coords_df) == 0:
                return (body, None)

            if is_supervoxels:
                supervoxel_subset = set(supervoxel_subset)
                coords_df = coords_df.query('sv in @supervoxel_subset').copy()

            coords_df[['z', 'y', 'x']] //= brick_shape
            coords_df['body'] = np.uint64(body)
            coords_df.drop_duplicates(inplace=True)
            return (body, coords_df)

        def fetch_and_concatenate_brick_coords(bodies_and_supervoxels):
            """
            To reduce the number of tiny DataFrames collected to the driver,
            it's best to concatenate the partitions first, on the workers,
            rather than mapping fetch_brick_coords() over every item individually.

            Hence this function, which consolidates each partition into a single DataFrame.
            """
            bad_bodies = []
            coord_dfs = []
            for (body, supervoxel_subset) in bodies_and_supervoxels:
                _, coords_df = fetch_brick_coords(body, supervoxel_subset)
                if coords_df is None:
                    bad_bodies.append(body)
                else:
                    coord_dfs.append(coords_df)
                    del coords_df

            if coord_dfs:
                return [(pd.concat(coord_dfs, ignore_index=True), bad_bodies)]
            else:
                return [(None, bad_bodies)]

        with Timer(
                f"Fetching coarse sparsevols for {len(labels)} labels ({len(bodies_and_svs)} bodies)",
                logger=logger):
            import dask.bag as db
            # Instead of fancy heuristics, just pick 4096 partitions.
            coords_and_bad_bodies = (db.from_sequence(bodies_and_svs.items(), npartitions=4096)
                                       .map_partitions(fetch_and_concatenate_brick_coords)
                                       .compute())

        coords_df_partitions, bad_body_partitions = zip(*coords_and_bad_bodies)

        for body in chain(*bad_body_partitions):
            if is_supervoxels:
                bad_labels.extend(bodies_and_svs[body])
            else:
                bad_labels.append(body)

        if bad_labels:
            name = 'sv' if is_supervoxels else 'body'
            pd.Series(bad_labels, name=name).to_csv(
                'labels-without-sparsevols.csv', index=False, header=True)
            if len(bad_labels) < 100:
                msg = f"Could not obtain coarse sparsevol for {len(bad_labels)} labels: {bad_labels}"
            else:
                msg = f"Could not obtain coarse sparsevol for {len(bad_labels)} labels. See labels-without-sparsevols.csv"

            logger.warning(msg)

        coords_df_partitions = list(
            filter(lambda df: df is not None, coords_df_partitions))
        if len(coords_df_partitions) == 0:
            raise RuntimeError(
                "Could not find bricks for any of the given labels")

        coords_df = pd.concat(coords_df_partitions, ignore_index=True)

        if self.supervoxels:
            coords_df['label'] = coords_df['sv']
        else:
            coords_df['label'] = coords_df['body']

        coords_df.drop_duplicates(['z', 'y', 'x', 'label'], inplace=True)
        coords_df[['z', 'y', 'x']] *= brick_shape

        if clip:
            # Keep if the last pixel in the brick is to the right of the bounding-box start
            # and the first pixel in the brick is to the left of the bounding-box stop
            keep = (coords_df[['z', 'y', 'x']] + brick_shape >
                    self.bounding_box_zyx[0]).all(axis=1)
            keep &= (coords_df[['z', 'y', 'x']] <
                     self.bounding_box_zyx[1]).all(axis=1)
            coords_df = coords_df.loc[keep]

        return coords_df[['z', 'y', 'x', 'label']]
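
For reference, the clipping test at the end of sparse_brick_coords_for_labels() is a simple overlap check between each brick and the service's bounding box. A tiny worked example with invented numbers:

import numpy as np

brick_shape = np.array([64, 64, 64])
bounding_box_zyx = np.array([[0, 0, 0], [128, 128, 128]])

brick_corners = np.array([[64, 64, 64],    # overlaps the box -> kept
                          [128, 0, 0]])    # starts at the box's stop -> dropped

keep = (brick_corners + brick_shape > bounding_box_zyx[0]).all(axis=1)
keep &= (brick_corners < bounding_box_zyx[1]).all(axis=1)
assert keep.tolist() == [True, False]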