Ejemplo n.º 1
0
    def parallelize_bounding_box( self,
                                  instance_name,
                                  bounding_box_zyx,
                                  grid,
                                  target_partition_size_voxels ):
        """
        Build a collection of bricks covering a bounding box of one data instance.

        Args:
            instance_name:
                Name handed to self.get_volume_accessor() to obtain the
                function that fetches voxel data.
            bounding_box_zyx:
                The region to cover, forwarded unchanged to
                generate_bricks_from_volume_source().
            grid:
                Object whose ``block_shape`` gives the shape of one brick.
            target_partition_size_voxels:
                Desired number of voxels per partition.  It is converted to a
                bricks-per-partition count by floor-dividing by the voxel
                count of one block.

        Returns:
            The result of generate_bricks_from_volume_source(), repartitioned
            if it had fewer partitions than cpus_per_worker() * num_worker_nodes().
        """
        voxels_per_block = np.prod(grid.block_shape)
        bricks_per_partition = target_partition_size_voxels // voxels_per_block

        accessor = self.get_volume_accessor(instance_name)
        bricks = generate_bricks_from_volume_source(
            bounding_box_zyx, grid, accessor, self.sc, bricks_per_partition)

        # Tiny volumes (e.g. in tests) may yield very few partitions;
        # spread them out so there is at least one partition per core.
        min_partitions = cpus_per_worker() * num_worker_nodes()
        if bricks.getNumPartitions() < min_partitions:
            bricks = bricks.repartition(min_partitions)

        return bricks
Ejemplo n.º 2
0
    def _sanitize_config(self):
        """
        Validate the input/output settings and resolve an 'auto' slab depth.

        Checks performed (via ``assert``):
          - every element of the output bounding box either equals the
            corresponding element of the input bounding box or is -1;
          - the output's ``slice-xy-offset`` is ``[0, 0]``.

        If ``options["slices-per-slab"]`` is -1 it is overwritten in place with
        a multiple of the input brick depth (rounded up so that, for positive
        depths, it is at least the total number of threads).
        """
        config = self.config_data
        input_config = config["input"]
        output_config = config["output"]
        options = config["options"]

        # The returned service is discarded; the call is made only so that
        # 'auto' entries in the input config get filled in.
        # NOTE(review): only the input config is passed here -- confirm whether
        # the output config was meant to receive the same treatment.
        VolumeService.create_from_config(input_config, self.config_dir)

        # Output bounding-box must match exactly (or be left as auto, i.e. -1)
        in_box = self.input_service.bounding_box_zyx
        out_box = self.output_service.bounding_box_zyx
        same_as_input = (out_box == in_box)
        left_as_auto = (out_box == -1)
        assert (same_as_input | left_as_auto).all(), \
            "Output bounding box must match the input bounding box exactly. (No translation permitted)."

        xy_offset = output_config["slice-files"]["slice-xy-offset"]
        assert xy_offset == [0, 0], "Nonzero xy offset is meaningless for outputs."

        if options["slices-per-slab"] != -1:
            # An explicit slab depth was configured; nothing to choose.
            return

        # Auto-choose a depth that keeps all threads busy with at least one slice
        brick_depth = self.input_service.preferred_message_shape[0]
        assert brick_depth != -1

        num_threads = num_worker_nodes() * cpus_per_worker()
        rounded_up_numerator = num_threads + brick_depth - 1
        brick_layers = rounded_up_numerator // brick_depth  # ceiling division
        options["slices-per-slab"] = brick_depth * brick_layers
    def _sanitize_config(self):
        """
        Check the configuration for consistency and fill in the 'auto' slab depth.

        Asserts that the output bounding box is element-wise either identical
        to the input bounding box or -1, and that the output
        ``slice-xy-offset`` equals ``[0, 0]``.

        When ``options["slices-per-slab"]`` is -1, it is replaced in place by
        the input brick depth times the number of brick layers needed
        (rounded up) to give every thread at least one slice.
        """
        settings = self.config_data
        input_config = settings["input"]
        output_config = settings["output"]
        options = settings["options"]

        # Called for its effect on the config only; the service itself is dropped.
        # NOTE(review): only the input config is initialized here -- confirm
        # whether the output config should be handled as well.
        VolumeService.create_from_config( input_config, self.config_dir )

        # Each output bounding-box element must equal the input's, or be -1 (auto).
        input_bb_zyx = self.input_service.bounding_box_zyx
        output_bb_zyx = self.output_service.bounding_box_zyx
        element_ok = (output_bb_zyx == input_bb_zyx) | (output_bb_zyx == -1)
        assert element_ok.all(), \
            "Output bounding box must match the input bounding box exactly. (No translation permitted)."

        slice_files_config = output_config["slice-files"]
        assert slice_files_config["slice-xy-offset"] == [0,0], "Nonzero xy offset is meaningless for outputs."

        auto_depth_requested = (options["slices-per-slab"] == -1)
        if auto_depth_requested:
            depth = self.input_service.preferred_message_shape[0]
            assert depth != -1

            total_threads = num_worker_nodes() * cpus_per_worker()
            # Round up, so that depth * layers covers all threads.
            layers = (total_threads + depth-1) // depth
            options["slices-per-slab"] = depth * layers
Ejemplo n.º 4
0
    def _execute_labelindices(self, mapping_df):
        """
        Load block statistics, split them into batches, and process each batch
        on the cluster.

        The statistics are read from the file named by
        ``config["block-stats-file"]``, batched together with ``mapping_df`` by
        generate_stats_batches(), distributed with ``self.sc.parallelize()``,
        and each batch is handed to a StatsBatchProcessor bound to the
        ``.../indices`` endpoint of the configured segmentation instance.

        Args:
            mapping_df:
                Passed through to generate_stats_batches().
        """
        config = self.config_data
        options = config["options"]
        rm_client = ResourceManagerClient(options["resource-server"],
                                          options["resource-port"])

        last_mutid = options["mutation-id"]
        server = config["dvid"]["server"]
        uuid = config["dvid"]["uuid"]
        instance_name = config["dvid"]["segmentation-name"]
        endpoint = f'{server}/api/node/{uuid}/{instance_name}/indices'

        batch_processor = StatsBatchProcessor(last_mutid, endpoint)

        # Read the block statistics from the h5 file.
        stats_records = load_stats_h5_to_records(config["block-stats-file"])

        # Note: Initializing this generator involves sorting the (very large) stats array
        rows_per_batch = options["batch-row-count"]
        batch_generator = generate_stats_batches(stats_records, mapping_df, rows_per_batch)

        num_partitions = cpus_per_worker() * num_worker_nodes()
        batches = self.sc.parallelize(batch_generator, num_partitions)
        rt.persist_and_execute(batches, "Distributing batches", logger)

        def send_batch(item):
            # Each item is a (stats_batch, total_rows) pair.
            stats_batch, total_rows = item
            # Rough size estimate used for the resource request (highly unscientific).
            estimated_bytes = 30 * total_rows
            with rm_client.access_context(server, False, 1, estimated_bytes):
                batch_processor.process_batch((stats_batch, total_rows))

        with Timer("Processing/sending batches", logger):
            batches.foreach(send_batch)
def compute_stats(block_shape, concurrent_threads, df, threads_per_worker=16):
    """
    Summarize per-request timings into a dictionary of aggregate statistics.

    Args:
        block_shape:
            Shape of one block; its product is the voxel count of a block.
        concurrent_threads:
            Thread count, reported as-is and used as the divisor for
            'approx-requests-per-thread'.
        df:
            Table with one row per request and the columns 'voxel_count',
            'seconds' and 'timestamp'.  The first and last rows are used as
            the start and end of the run when computing 'wall-time', and
            subtracting two 'timestamp' values must yield a timedelta.
        threads_per_worker:
            Number of threads counted per worker node for 'available-threads'.
            Defaults to 16, the value that used to be hard-coded.

    Returns:
        dict mapping statistic names to values.
    """
    num_requests = len(df)
    total_blocks = df['voxel_count'].sum() / np.prod(block_shape)

    # Use total_seconds() rather than the .seconds attribute: .seconds holds
    # only the seconds *component* of the timedelta (days are dropped), which
    # under-reports the wall time of any run spanning more than a day.
    elapsed = df['timestamp'].iloc[-1] - df['timestamp'].iloc[0]
    wall_time = elapsed.total_seconds() + df['seconds'].iloc[-1]

    stats = {
        "total-analyzed-requests": num_requests,

        "num-workers": num_worker_nodes(),
        "available-threads": threads_per_worker * num_worker_nodes(),
        "concurrent-threads": concurrent_threads,
        "approx-requests-per-thread": num_requests / (concurrent_threads),

        "blocks-per-request": total_blocks / num_requests,
        "seconds-per-request": df['seconds'].mean(),
        "seconds-per-block": df['seconds'].sum() / total_blocks,

        "wall-time": wall_time,
    }
    return stats
Ejemplo n.º 6
0
    def group_by_body(self, segments_and_meshes):
        """
        Group (body_id, mesh) pairs according to the configured grouping scheme.

        The scheme is read from
        ``config["mesh-config"]["storage"]["grouping-scheme"]``:

          - "hundreds": bodies are grouped by their ID rounded down to a
            multiple of 100.
          - "labelmap": the group of each body is looked up in the pairs
            returned by self.load_labelmap(); bodies missing from the labelmap
            form their own group.  Groups listed in
            ``config["mesh-config"]["storage"]["skip-groups"]`` are dropped.
          - "singletons" / "no-groups": every body forms a group of one,
            keyed by its own ID.

        Args:
            segments_and_meshes:
                RDD-like collection of (body_id, mesh) pairs.

        Returns:
            Collection of (group_id, [(body_id, mesh), ...]) items, which has
            already been passed to persist_and_execute().

        Raises:
            ValueError: if the configured grouping scheme is not recognized.
        """
        config = self.config_data

        # Group according to scheme
        grouping_scheme = config["mesh-config"]["storage"]["grouping-scheme"]
        n_partitions = num_worker_nodes() * cpus_per_worker()

        # Note: this must be an equality test.  ``grouping_scheme in "hundreds"``
        # is a substring test and would also accept "", "h", "red", etc.
        if grouping_scheme == "hundreds":

            def hundreds_group_id(id_mesh):
                # Round the body ID down to the nearest multiple of 100.
                body_id, _mesh = id_mesh
                return body_id - (body_id % 100)

            grouped_body_ids_and_meshes = segments_and_meshes.groupBy(
                hundreds_group_id, numPartitions=n_partitions)

        elif grouping_scheme == "labelmap":
            import pandas as pd
            mapping_pairs = self.load_labelmap()

            def prepend_mapped_group_id(id_mesh_partition):
                df = pd.DataFrame(mapping_pairs,
                                  columns=["body_id", "group_id"])

                new_partition = []
                for id_mesh in id_mesh_partition:
                    body_id, mesh = id_mesh
                    rows = df.loc[df.body_id == body_id]
                    if len(rows) == 0:
                        # If missing from labelmap,
                        # we assume an implicit identity mapping
                        group_id = body_id
                    else:
                        group_id = rows['group_id'].iloc[0]
                    new_partition.append((group_id, (body_id, mesh)))
                return new_partition

            # We do this via mapPartitions().groupByKey() instead of a simple groupBy()
            # to save time constructing the DataFrame inside the closure above.
            # (TODO: Figure out why the dataframe isn't pickling properly...)
            skip_groups = set(config["mesh-config"]["storage"]["skip-groups"])
            grouped_body_ids_and_meshes = segments_and_meshes.mapPartitions( prepend_mapped_group_id ) \
                                                             .filter(lambda item: item[0] not in skip_groups) \
                                                             .groupByKey(numPartitions=n_partitions)
        elif grouping_scheme in ("singletons", "no-groups"):
            # Create 'groups' of one item each, re-using the body ID as the group id.
            grouped_body_ids_and_meshes = segments_and_meshes.map(
                lambda id_mesh: (id_mesh[0], [(id_mesh[0], id_mesh[1])]))

        else:
            # Previously an unknown scheme fell through and failed later with
            # an UnboundLocalError; report the real problem instead.
            raise ValueError(f"Unrecognized grouping scheme: '{grouping_scheme}'")

        persist_and_execute(
            grouped_body_ids_and_meshes,
            f"Grouping meshes with scheme: '{grouping_scheme}'", logger)
        return grouped_body_ids_and_meshes
Ejemplo n.º 7
0
    def from_accessor_func(cls, bounding_box, grid, volume_accessor_func=None, sc=None, target_partition_size_voxels=None, sparse_boxes=None, lazy=False):
        """
        Construct a BrickWall whose bricks are produced by an arbitrary accessor function.

        Args:
            bounding_box:
                (start, stop)

            grid:
                Grid (see brick.py)

            volume_accessor_func:
                Callable with signature: f(box) -> ndarray
                Note: The callable will be unpickled only once per partition, so initialization
                      costs after unpickling are only incurred once per partition.

            sc:
                SparkContext. If provided, an RDD is returned.  Otherwise, returns an ordinary Python iterable.

            target_partition_size_voxels:
                Optional. If provided, the RDD partition lengths (i.e. the number of bricks per RDD partition)
                will be chosen to have (approximately) this many total voxels in each partition.
                If omitted, a value is derived from the total voxel count and the number of
                threads, aiming for two partitions per thread.

            sparse_boxes:
                A list of (physical) sparse boxes indicating which bricks should actually be present in the BrickWall.
                If not provided, all bricks within the bounding_box will be present.
                A non-sized iterable is converted to a list when the partition size has to be derived.

            lazy:
                If True, the bricks' data will not be created until their 'volume' member is first accessed.

        Returns:
            BrickWall built from bounding_box, grid and the generated bricks.
        """
        if target_partition_size_voxels is None:
            # See RDDtools -- for now, non-spark pseudo-RDDs are just a single partition.
            thread_count = (num_worker_nodes() * cpus_per_worker()) if sc else 1

            if sparse_boxes is None:
                voxel_total = np.prod(bounding_box[1] - bounding_box[0])
            else:
                # Materialize one-shot iterables, since the boxes are consumed
                # here and again when the bricks are generated.
                if not hasattr(sparse_boxes, '__len__'):
                    sparse_boxes = list(sparse_boxes)
                voxel_total = sum(np.prod(box[1] - box[0]) for box in sparse_boxes)

            per_thread = voxel_total / thread_count
            # Arbitrarily aim for 2 partitions per thread
            target_partition_size_voxels = per_thread // 2

        voxels_per_block = np.prod(grid.block_shape)
        bricks_per_partition = target_partition_size_voxels // voxels_per_block

        bricks = generate_bricks_from_volume_source(
            bounding_box,
            grid,
            volume_accessor_func,
            sc,
            bricks_per_partition,
            sparse_boxes,
            lazy)
        return BrickWall( bounding_box, grid, bricks )
Ejemplo n.º 8
0
def compute_stats(block_shape, concurrent_threads, df, threads_per_worker=16):
    """
    Summarize per-request timings into a dictionary of aggregate statistics.

    Args:
        block_shape:
            Shape of one block; its product is the voxel count of a block.
        concurrent_threads:
            Thread count, reported as-is and used as the divisor for
            'approx-requests-per-thread'.
        df:
            Table with one row per request and the columns 'voxel_count',
            'seconds' and 'timestamp'.  The first and last rows are used as
            the start and end of the run when computing 'wall-time', and
            subtracting two 'timestamp' values must yield a timedelta.
        threads_per_worker:
            Number of threads counted per worker node for 'available-threads'.
            Defaults to 16, the value that used to be hard-coded.

    Returns:
        dict mapping statistic names to values.
    """
    request_count = len(df)
    request_seconds = df['seconds']
    total_blocks = df['voxel_count'].sum() / np.prod(block_shape)

    # timedelta.seconds is only the seconds component (whole days are
    # dropped), so use total_seconds() to get the true elapsed time even
    # when the run lasts longer than a day.
    first_timestamp = df['timestamp'].iloc[0]
    last_timestamp = df['timestamp'].iloc[-1]
    elapsed_seconds = (last_timestamp - first_timestamp).total_seconds()

    stats = {
        "total-analyzed-requests":
        request_count,
        "num-workers":
        num_worker_nodes(),
        "available-threads":
        threads_per_worker * num_worker_nodes(),
        "concurrent-threads":
        concurrent_threads,
        "approx-requests-per-thread":
        request_count / (concurrent_threads),
        "blocks-per-request":
        total_blocks / request_count,
        "seconds-per-request":
        request_seconds.mean(),
        "seconds-per-block":
        request_seconds.sum() / total_blocks,
        # The last request's own duration is added to the elapsed time
        # between the first and last timestamps.
        "wall-time":
        elapsed_seconds + request_seconds.iloc[-1],
    }
    return stats
    def group_by_body(self, segments_and_meshes):
        """
        Group (body_id, mesh) pairs according to the configured grouping scheme.

        The scheme is read from
        ``config["mesh-config"]["storage"]["grouping-scheme"]``:

          - "hundreds": bodies are grouped by their ID rounded down to a
            multiple of 100.
          - "labelmap": the group of each body is looked up in the pairs
            returned by self.load_labelmap(); bodies missing from the labelmap
            form their own group.  Groups listed in
            ``config["mesh-config"]["storage"]["skip-groups"]`` are dropped.
          - "singletons" / "no-groups": every body forms a group of one,
            keyed by its own ID.

        Args:
            segments_and_meshes:
                RDD-like collection of (body_id, mesh) pairs.

        Returns:
            Collection of (group_id, [(body_id, mesh), ...]) items, which has
            already been passed to persist_and_execute().

        Raises:
            ValueError: if the configured grouping scheme is not recognized.
        """
        config = self.config_data

        # Group according to scheme
        grouping_scheme = config["mesh-config"]["storage"]["grouping-scheme"]
        n_partitions = num_worker_nodes() * cpus_per_worker()

        # Equality, not ``in "hundreds"``: the latter is a substring test that
        # would also accept values such as "" or "red".
        if grouping_scheme == "hundreds":
            def hundreds_group_id( id_mesh ):
                # Round the body ID down to the nearest multiple of 100.
                body_id, _mesh = id_mesh
                return body_id - (body_id % 100)
            grouped_body_ids_and_meshes = segments_and_meshes.groupBy(hundreds_group_id, numPartitions=n_partitions)

        elif grouping_scheme == "labelmap":
            import pandas as pd
            mapping_pairs = self.load_labelmap()

            def prepend_mapped_group_id( id_mesh_partition ):
                df = pd.DataFrame( mapping_pairs, columns=["body_id", "group_id"] )

                new_partition = []
                for id_mesh in id_mesh_partition:
                    body_id, mesh = id_mesh
                    rows = df.loc[df.body_id == body_id]
                    if len(rows) == 0:
                        # If missing from labelmap,
                        # we assume an implicit identity mapping
                        group_id = body_id
                    else:
                        group_id = rows['group_id'].iloc[0]
                    new_partition.append( (group_id, (body_id, mesh)) )
                return new_partition

            # We do this via mapPartitions().groupByKey() instead of a simple groupBy()
            # to save time constructing the DataFrame inside the closure above.
            # (TODO: Figure out why the dataframe isn't pickling properly...)
            skip_groups = set(config["mesh-config"]["storage"]["skip-groups"])
            grouped_body_ids_and_meshes = segments_and_meshes.mapPartitions( prepend_mapped_group_id ) \
                                                             .filter(lambda item: item[0] not in skip_groups) \
                                                             .groupByKey(numPartitions=n_partitions)
        elif grouping_scheme in ("singletons", "no-groups"):
            # Create 'groups' of one item each, re-using the body ID as the group id.
            grouped_body_ids_and_meshes = segments_and_meshes.map( lambda id_mesh: (id_mesh[0], [(id_mesh[0], id_mesh[1])]) )

        else:
            # An unknown scheme used to surface as an UnboundLocalError below;
            # fail with a message that names the offending value instead.
            raise ValueError(f"Unrecognized grouping scheme: '{grouping_scheme}'")

        persist_and_execute(grouped_body_ids_and_meshes, f"Grouping meshes with scheme: '{grouping_scheme}'", logger)
        return grouped_body_ids_and_meshes
    def _process_slab(self, scale, slab_fullres_box_zyx, slab_index, num_slabs, upscale_slab_wall):
        """
        Produce, pad and write the bricks of one slab at one pyramid scale.

        Args:
            scale:
                Pyramid scale being produced.  The slab's full-resolution voxel
                count is divided by ``(2**scale)**3`` when sizing partitions.
            slab_fullres_box_zyx:
                (start, stop) box of the slab, indexable as box[0] / box[1].
            slab_index:
                Index of the slab; used in log messages and forwarded to
                self.adjust_contrast().
            num_slabs:
                Total number of slabs; used in the status log message only.
            upscale_slab_wall:
                Brick wall of the previous scale.  It is downsampled and then
                unpersisted unless the data is copied from the input source
                (``pyramid-source`` is "copy", or scale is 0).

        Returns:
            The padded brick wall that was written to the output service.
        """
        num_threads = num_worker_nodes() * cpus_per_worker()
        slab_shape = slab_fullres_box_zyx[1] - slab_fullres_box_zyx[0]
        slab_voxels = np.prod(slab_shape) // (2**scale)**3
        voxels_per_thread = slab_voxels // num_threads

        options = self.config_data["options"]
        pyramid_source = options["pyramid-source"]
        copy_from_input = (pyramid_source == "copy" or scale == 0)

        if copy_from_input:
            # Read this scale directly from the input source.
            bricked_slab_wall = BrickWall.from_volume_service(
                self.input_service,
                scale,
                slab_fullres_box_zyx,
                self.sc,
                voxels_per_thread // 2)
            download_msg = f"Slab {slab_index}: Downloading scale {scale}"
            bricked_slab_wall.persist_and_execute(download_msg, logger)
        else:
            # Derive this scale from the previous one, then release the previous one.
            bricked_slab_wall = upscale_slab_wall.downsample( (2,2,2), 'grayscale' )
            downsample_msg = f"Slab {slab_index}: Downsampling to scale {scale}"
            bricked_slab_wall.persist_and_execute(downsample_msg, logger)
            upscale_slab_wall.unpersist()
            del upscale_slab_wall

        if scale == 0:
            bricked_slab_wall = self.adjust_contrast(bricked_slab_wall, slab_index)

        # Remap to output bricks
        output_grid = Grid(self.output_service.preferred_message_shape)
        output_slab_wall = bricked_slab_wall.realign_to_new_grid( output_grid )

        # Pad from previously-existing pyramid data until
        # we have full storage blocks, e.g. (64,64,64),
        # but not necessarily full bricks, e.g. (64,64,6400)
        output_accessor_func = partial(self.output_service.get_subvolume, scale=scale)
        if scale == 0:
            # But don't bother fetching real data for scale 0:
            # the input slabs are already block-aligned, and the edges of each slice will be zeros anyway.
            output_accessor_func = lambda _box: 0

        block_width = self.output_service.block_width
        padding_grid = Grid( 3*(block_width,), output_grid.offset )
        padded_slab_wall = output_slab_wall.fill_missing(output_accessor_func, padding_grid)
        padded_slab_wall.persist_and_execute(f"Slab {slab_index}: Assembling scale {scale} bricks", logger)

        # The unpadded bricks are no longer needed.
        bricked_slab_wall.unpersist()
        del bricked_slab_wall

        logger.info(f"Slab {slab_index}: Writing scale {scale}", extra={"status": f"Writing {slab_index}/{num_slabs}"})
        brick_writer = partial(write_brick, self.output_service, scale)
        rt.foreach( brick_writer, padded_slab_wall.bricks )

        return padded_slab_wall
Ejemplo n.º 11
0
    def run_on_each_worker(self, func):
        """
        Run the given function once per worker node.

        Many more tasks than workers are scheduled; on each host the first task
        to acquire the file lock creates a marker file under /tmp and runs
        ``func``, and every later task on that host sees the marker and
        returns None.

        Args:
            func:
                Callable taking no arguments.  Its ``__name__`` is used in log
                and error messages.

        Returns:
            dict mapping hostname -> result of ``func()`` on that host.

        Raises:
            AssertionError: if the number of hosts that reported a result
                differs from num_worker_nodes().
        """
        status_filepath = '/tmp/' + self._execution_uuid + '-' + str(self._worker_task_id)
        self._worker_task_id += 1

        @self.collect_log(lambda i: socket.gethostname() + '[' + func.__name__ + ']')
        def task_f(i):
            with FileLock(status_filepath):
                if os.path.exists(status_filepath):
                    return None

                # Create an empty file to indicate the task was executed.
                # The handle is closed right away instead of being left for
                # the garbage collector to clean up.
                with open(status_filepath, 'w'):
                    pass

            result = func()
            return (socket.gethostname(), result)

        num_workers = num_worker_nodes()

        # It would be nice if we only had to schedule N tasks for N workers,
        # but we couldn't ensure that tasks are hashed 1-to-1 onto workers.
        # Instead, we'll schedule **LOTS** of extra tasks, but the logic in
        # task_f() will skip the unnecessary work.
        num_tasks = num_workers * 1000

        # Execute the tasks.  Returns [(hostname, result), None, None, (hostname, result), ...],
        # with 'None' interspersed for hosts that were hit multiple times.
        # (Each host only returns a single non-None result)
        host_results = self.sc.parallelize(list(range(num_tasks)), num_tasks)\
                            .repartition(num_tasks).map(task_f).collect()
        host_results = [_f for _f in host_results if _f] # Drop Nones

        host_results = dict(host_results)

        assert len(host_results) == num_workers, \
            "Task '{}' was not executed all workers ({}), or some tasks failed! Nodes processed: \n{}"\
            .format(func.__name__, num_workers, host_results)
        logger.info("Ran {} on {} nodes: {}".format(func.__name__, len(host_results), host_results))
        return host_results
Ejemplo n.º 12
0
    def _execute_mesh_generation(self, large_id_box_mask_factor_err):
        """
        Generate meshes, group them by the configured scheme, and post them to DVID.

        Steps:
          1. Each input item is passed to generate_mesh_in_subprocess(), giving
             (body_id, mesh_bytes, error_msg) tuples.
          2. Non-empty error messages are collected and logged; items whose
             error is not None are dropped.
          3. The remaining (body_id, mesh) pairs are grouped according to
             ``config["mesh-config"]["storage"]["grouping-scheme"]``
             ("hundreds", "labelmap", "singletons" or "no-groups").
          4. Each partition of groups is handed to post_meshes_to_dvid().

        Args:
            large_id_box_mask_factor_err:
                RDD-like collection of the items expected by
                generate_mesh_in_subprocess().

        Raises:
            ValueError: if the configured grouping scheme is not recognized.
        """
        config = self.config_data

        @self.collect_log(lambda _: '_MESH_GENERATION_ERRORS')
        def logged_generate_mesh(arg):
            return generate_mesh_in_subprocess(config, arg)

        #     --> (body_id, mesh_bytes, error_msg)
        body_ids_and_meshes_with_err = large_id_box_mask_factor_err.map(
            logged_generate_mesh)
        persist_and_execute(body_ids_and_meshes_with_err, "Computing meshes",
                            logger)

        # Errors were already written to a separate file, but let's duplicate them in the master log.
        errors = body_ids_and_meshes_with_err.map(
            lambda id_mesh_err: id_mesh_err[-1]).filter(bool).collect()
        for error in errors:
            logger.error(error)

        # Filter out error cases
        body_ids_and_meshes = body_ids_and_meshes_with_err.filter(lambda id_mesh_err: id_mesh_err[-1] is None) \
                                                          .map( lambda id_mesh_err: id_mesh_err[:2] )

        # Group according to scheme
        grouping_scheme = config["mesh-config"]["storage"]["grouping-scheme"]
        n_partitions = num_worker_nodes() * cpus_per_worker()

        # Note: this must be an equality test.  ``grouping_scheme in "hundreds"``
        # is a substring test and would also accept "", "h", "red", etc.
        if grouping_scheme == "hundreds":

            def hundreds_group_id(id_mesh):
                # Round the body ID down to the nearest multiple of 100.
                body_id, _mesh = id_mesh
                return body_id - (body_id % 100)

            grouped_body_ids_and_meshes = body_ids_and_meshes.groupBy(
                hundreds_group_id, numPartitions=n_partitions)

        elif grouping_scheme == "labelmap":
            import pandas as pd
            mapping_pairs = load_labelmap(
                config["mesh-config"]["storage"]["labelmap"], self.config_dir)

            def prepend_mapped_group_id(id_mesh_partition):
                df = pd.DataFrame(mapping_pairs,
                                  columns=["body_id", "group_id"])

                new_partition = []
                for id_mesh in id_mesh_partition:
                    body_id, mesh = id_mesh
                    rows = df.loc[df.body_id == body_id]
                    if len(rows) == 0:
                        # If missing from labelmap,
                        # we assume an implicit identity mapping
                        group_id = body_id
                    else:
                        group_id = rows['group_id'].iloc[0]
                    new_partition.append((group_id, (body_id, mesh)))
                return new_partition

            # We do this via mapPartitions().groupByKey() instead of a simple groupBy()
            # to save time constructing the DataFrame inside the closure above.
            # (TODO: Figure out why the dataframe isn't pickling properly...)
            skip_groups = set(config["mesh-config"]["storage"]["skip-groups"])
            grouped_body_ids_and_meshes = body_ids_and_meshes.mapPartitions( prepend_mapped_group_id ) \
                                                             .filter(lambda item: item[0] not in skip_groups) \
                                                             .groupByKey(numPartitions=n_partitions)
        elif grouping_scheme in ("singletons", "no-groups"):
            # Create 'groups' of one item each, re-using the body ID as the group id.
            # (The difference between 'singletons', and 'no-groups' is in how the mesh is stored, below.)
            grouped_body_ids_and_meshes = body_ids_and_meshes.map(
                lambda id_mesh: (id_mesh[0], [(id_mesh[0], id_mesh[1])]))

        else:
            # Previously an unknown scheme fell through and failed later with
            # an UnboundLocalError; report the real problem instead.
            raise ValueError(f"Unrecognized grouping scheme: '{grouping_scheme}'")

        persist_and_execute(
            grouped_body_ids_and_meshes,
            f"Grouping meshes with scheme: '{grouping_scheme}'", logger)
        unpersist(body_ids_and_meshes)
        del body_ids_and_meshes

        with Timer() as timer:
            grouped_body_ids_and_meshes.foreachPartition(
                partial(post_meshes_to_dvid, config))
        logger.info(f"Writing meshes to DVID took {timer.seconds}")
Ejemplo n.º 13
0
    def execute(self):
        """
        Export the input volume slab by slab through the output service.

        After initializing the services and sanitizing the config, the input
        bounding box is divided along its first (Z) axis into slabs of
        ``options["slices-per-slab"]`` slices.  For each slab, the bricks are
        downloaded from the input service, realigned into bricks that are one
        slice deep, and each such brick is written via
        ``output_service.write_subvolume()``.
        """
        self._init_services()
        self._sanitize_config()

        options = self.config_data["options"]

        output_service = self.output_service
        logger.info(f"Output bounding box: {output_service.bounding_box_zyx[:,::-1]}")

        # Data is processed in Z-slabs
        slab_depth = options["slices-per-slab"]

        input_bb_zyx = self.input_service.bounding_box_zyx
        _, slice_start_y, slice_start_x = input_bb_zyx[0]

        # A slab spans the full YX extent of the input, and slab_depth slices in Z.
        slab_shape_zyx = input_bb_zyx[1] - input_bb_zyx[0]
        slab_shape_zyx[0] = slab_depth

        # A slice has the same YX extent, but is a single voxel deep.
        slice_shape_zyx = slab_shape_zyx.copy()
        slice_shape_zyx[0] = 1

        # This grid outlines the slabs -- each grid box is a full slab
        slab_grid = Grid(slab_shape_zyx, (0, slice_start_y, slice_start_x))
        slab_boxes = list(clipped_boxes_from_grid(input_bb_zyx, slab_grid))
        num_slabs = len(slab_boxes)

        def write_slice(brick):
            # Each slice-brick is expected to be complete (not clipped).
            assert (brick.physical_box == brick.logical_box).all()
            output_service.write_subvolume(brick.volume, brick.physical_box[0])

        for slab_index, slab_box_zyx in enumerate(slab_boxes):
            # Construct BrickWall from input bricks
            num_threads = num_worker_nodes() * cpus_per_worker()
            slab_voxels = np.prod(slab_box_zyx[1] - slab_box_zyx[0])
            voxels_per_thread = slab_voxels / num_threads

            bricked_slab_wall = BrickWall.from_volume_service(
                self.input_service,
                0,
                slab_box_zyx,
                self.sc,
                voxels_per_thread / 2)

            # Force download
            bricked_slab_wall.persist_and_execute(f"Downloading slab {slab_index}/{num_slabs}: {slab_box_zyx[:,::-1]}", logger)

            # Remap to slice-sized "bricks"
            sliced_grid = Grid(slice_shape_zyx, offset=slab_box_zyx[0])
            sliced_slab_wall = bricked_slab_wall.realign_to_new_grid( sliced_grid )
            sliced_slab_wall.persist_and_execute(f"Assembling slab {slab_index}/{num_slabs} slices", logger)

            # The original bricks are no longer needed.
            bricked_slab_wall.unpersist()
            del bricked_slab_wall

            # Write every slice of this slab through the output service.
            with Timer() as timer:
                logger.info(f"Exporting slab {slab_index}/{num_slabs}", extra={"status": f"Exporting {slab_index}/{num_slabs}"})
                rt.foreach( write_slice, sliced_slab_wall.bricks )
            logger.info(f"Exporting slab {slab_index}/{num_slabs} took {timer.timedelta}",
                        extra={"status": f"Done: {slab_index}/{num_slabs}"})

            # Discard slice data
            sliced_slab_wall.unpersist()
            del sliced_slab_wall

        logger.info(f"DONE exporting {num_slabs} slabs.", extra={'status': "DONE"})
Ejemplo n.º 14
0
def generate_bricks_from_volume_source( bounding_box, grid, volume_accessor_func, sc=None, rdd_partition_length=None, sparse_boxes=None, lazy=False ):
    """
    Generate an RDD or iterable of Bricks for the given bounding box and grid.

    Args:
        bounding_box:
            (start, stop)

        grid:
            Grid (see above)

        volume_accessor_func:
            Callable with signature: f(box) -> ndarray
            Note: The callable will be unpickled only once per partition, so initialization
                  costs after unpickling are only incurred once per partition.

        sc:
            SparkContext. If provided, an RDD is returned.  Otherwise, returns an ordinary Python iterable.

        rdd_partition_length:
            Optional. If provided, the RDD will have (approximately) this many bricks per partition.
            Values below 1 are treated as 1.  Only used when ``sc`` is provided.

        sparse_boxes:
            Optional.
            A pre-calculated list of boxes to use instead of calculating
            the complete (dense) list of grid boxes within the bounding box.
            If provided, should be a list of physical boxes, and no two should occupy
            the same logical box, as defined by their midpoints.
            Must be convertible to an array of shape (N, 2, 3).
            Note: They will still be clipped to the overall bounding_box.
            Note: The given boxes are NOT modified; the halo is applied to a copy.

        lazy:
            If True, each Brick is constructed with ``lazy_creation_fn=volume_accessor_func``
            instead of calling ``volume_accessor_func`` right away to obtain its volume.

    Note:
        There is no separate halo argument.  In the sparse case, each box is expanded
        by ``grid.halo_shape`` on both sides before it is clipped to the bounding box.
        In the dense case, the physical boxes come from ``clipped_boxes_from_grid()``
        (presumably that also accounts for the grid's halo -- TODO confirm).

    Returns:
        The result of ``rt.map()`` applied to the (logical_box, physical_box) pairs,
        yielding one Brick per pair.
    """
    if sparse_boxes is None:
        # Generate boxes from densely populated grid
        logical_boxes = boxes_from_grid(bounding_box, grid, include_halos=False)
        physical_boxes = clipped_boxes_from_grid(bounding_box, grid)
        logical_and_physical_boxes = zip( logical_boxes, physical_boxes )
    else:
        # User provided list of physical boxes.
        # Clip them to the bounding box and calculate the logical boxes.
        if not hasattr(sparse_boxes, '__len__'):
            sparse_boxes = list( sparse_boxes )
        physical_boxes = np.asarray( sparse_boxes )
        assert physical_boxes.ndim == 3 and physical_boxes.shape[1:3] == (2,3)

        def logical_and_clipped( box ):
            midpoint = (box[0] + box[1]) // 2
            logical_box = grid.compute_logical_box( midpoint )

            # Apply the halo to a copy.  'box' is a view into physical_boxes, which
            # (thanks to np.asarray) may be the caller's own array, so an in-place
            # update here would silently modify the caller's data.
            haloed_box = box.copy()
            haloed_box += (-grid.halo_shape, grid.halo_shape)

            # Note: Non-intersecting boxes will have non-positive shape after clipping
            clipped_box = box_intersection(haloed_box, bounding_box)
            return ( logical_box, clipped_box )

        logical_and_physical_boxes = map(logical_and_clipped, physical_boxes)

        # Drop any boxes that fall completely outside the bounding box
        # Check that physical box doesn't completely fall outside its logical_box
        def is_valid(logical_and_physical):
            logical_box, physical_box = logical_and_physical
            return (physical_box[1] > logical_box[0]).all() and (physical_box[0] < logical_box[1]).all()
        logical_and_physical_boxes = filter(is_valid, logical_and_physical_boxes )

    if sc:
        if not hasattr(logical_and_physical_boxes, '__len__'):
            logical_and_physical_boxes = list(logical_and_physical_boxes) # need len()

        num_rdd_partitions = None
        if rdd_partition_length is not None:
            rdd_partition_length = max(1, rdd_partition_length)
            num_rdd_partitions = int( np.ceil( len(logical_and_physical_boxes) / rdd_partition_length ) )

        # If we're working with a tiny volume (e.g. testing),
        # make sure we at least parallelize across all cores.
        if num_rdd_partitions is not None and (num_rdd_partitions < cpus_per_worker() * num_worker_nodes()):
            num_rdd_partitions = cpus_per_worker() * num_worker_nodes()

        # Total voxel count is computed for the log message only.
        def brick_size(log_phys):
            _logical, physical = log_phys
            return np.uint64(np.prod(physical[1] - physical[0]))
        total_volume = sum(map(brick_size, logical_and_physical_boxes))
        logger.info(f"Initializing RDD of {len(logical_and_physical_boxes)} Bricks "
                    f"(over {num_rdd_partitions} partitions) with total volume {total_volume/1e9:.1f} Gvox")

        # Enumerate and repartition to get perfect partition sizes,
        # rather than relying on spark's default hash
        class _enumerated_value(tuple):
            # Return a hash based on the key alone.
            def __hash__(self):
                return self[0]

        enumerated_logical_and_physical_boxes = sc.parallelize( enumerate(logical_and_physical_boxes), num_rdd_partitions )
        enumerated_logical_and_physical_boxes = enumerated_logical_and_physical_boxes.map(_enumerated_value)
        # The partition function is the identity, applied to the enumeration index (the key).
        enumerated_logical_and_physical_boxes = enumerated_logical_and_physical_boxes.partitionBy(num_rdd_partitions, lambda x: x)
        logical_and_physical_boxes = enumerated_logical_and_physical_boxes.values()

    def make_bricks( logical_and_physical_box ):
        logical_box, physical_box = logical_and_physical_box
        if lazy:
            return Brick(logical_box, physical_box, lazy_creation_fn=volume_accessor_func)
        else:
            volume = volume_accessor_func(physical_box)
            return Brick(logical_box, physical_box, volume)

    bricks = rt.map( make_bricks, logical_and_physical_boxes )
    return bricks
Ejemplo n.º 15
0
    def execute(self):
        """
        Export the input volume slab-by-slab.

        The input bounding box is divided along Z into slabs of
        options["slices-per-slab"] slices.  For each slab, the input bricks are
        downloaded, realigned to a grid whose boxes are one slice deep
        (and as large as the bounding box in Y and X), and every resulting
        brick is handed to output_service.write_subvolume().
        """
        self._init_services()
        self._sanitize_config()

        options = self.config_data["options"]

        # The writer below closes over this local alias rather than over 'self'.
        output_service = self.output_service
        logger.info(f"Output bounding box: {output_service.bounding_box_zyx[:,::-1]}")

        # Data is processed in Z-slabs
        slab_depth = options["slices-per-slab"]

        input_bb_zyx = self.input_service.bounding_box_zyx
        _start_z, start_y, start_x = input_bb_zyx[0]

        # A slab spans the full YX extent and 'slab_depth' slices in Z;
        # a slice is the same, but exactly one voxel deep.
        full_shape_zyx = input_bb_zyx[1] - input_bb_zyx[0]

        slab_shape_zyx = full_shape_zyx.copy()
        slab_shape_zyx[0] = slab_depth

        slice_shape_zyx = full_shape_zyx.copy()
        slice_shape_zyx[0] = 1

        # This grid outlines the slabs -- each grid box is a full slab
        slab_grid = Grid(slab_shape_zyx, (0, start_y, start_x))
        slab_boxes = list(clipped_boxes_from_grid(input_bb_zyx, slab_grid))
        num_slabs = len(slab_boxes)

        def write_slice(brick):
            # Each brick must exactly fill its logical box before it is written.
            assert (brick.physical_box == brick.logical_box).all()
            output_service.write_subvolume(brick.volume, brick.physical_box[0])

        for slab_index, slab_box_zyx in enumerate(slab_boxes):
            progress = f"{slab_index}/{num_slabs}"

            # Construct BrickWall from input bricks,
            # targeting half of each thread's share of the slab per partition.
            num_threads = num_worker_nodes() * cpus_per_worker()
            slab_voxels = np.prod(slab_box_zyx[1] - slab_box_zyx[0])
            target_partition_voxels = (slab_voxels / num_threads) / 2

            bricked_slab_wall = BrickWall.from_volume_service(
                self.input_service, 0, slab_box_zyx, self.sc, target_partition_voxels)

            # Force download
            bricked_slab_wall.persist_and_execute(
                f"Downloading slab {progress}: {slab_box_zyx[:,::-1]}", logger)

            # Remap to slice-sized "bricks"
            sliced_grid = Grid(slice_shape_zyx, offset=slab_box_zyx[0])
            sliced_slab_wall = bricked_slab_wall.realign_to_new_grid(sliced_grid)
            sliced_slab_wall.persist_and_execute(f"Assembling slab {progress} slices", logger)

            # Discard original bricks
            bricked_slab_wall.unpersist()
            del bricked_slab_wall

            # Export to PNG or TIFF, etc. (automatic via slice path extension)
            with Timer() as timer:
                logger.info(f"Exporting slab {progress}",
                            extra={"status": f"Exporting {progress}"})
                rt.foreach(write_slice, sliced_slab_wall.bricks)
            logger.info(f"Exporting slab {progress} took {timer.timedelta}",
                        extra={"status": f"Done: {progress}"})

            # Discard slice data
            sliced_slab_wall.unpersist()
            del sliced_slab_wall

        logger.info(f"DONE exporting {num_slabs} slabs.", extra={'status': "DONE"})
    def execute(self):
        """
        Measure how long it takes to fetch blocks from the input volume.

        The input bounding box is divided into boxes of the volume service's
        preferred message shape.  For each box, the 'specificblocks' endpoint is
        requested (under the resource manager's access context) and the request
        is timed.  The collected measurements are passed to self.dump_stats().

        Raises:
            RuntimeError: If a resource server is configured and its 'read_reqs'
                          setting exceeds the cluster's total CPU count.
        """
        from pyspark import StorageLevel

        self._sanitize_config()
        config = self.config_data
        options = config["options"]

        resource_mgr_client = ResourceManagerClient(options["resource-server"], options["resource-port"])
        total_cpus = 16 * num_worker_nodes()

        if options["resource-server"]:
            concurrent_threads = options["resource-server-config"]["read_reqs"]
            if concurrent_threads > total_cpus:
                raise RuntimeError(
                    "You're attempting to use the resource manager to constrain concurrency, but you "
                    "aren't running with a large enough cluster to saturate the resource manager settings")
        else:
            concurrent_threads = total_cpus

        # We instantiate a VolumeService as an easy way to plug in missing config values as necessary.
        # (We won't actually use it.)
        volume_service = VolumeService.create_from_config(config["input"], self.config_dir)

        server = volume_service.server
        uuid = volume_service.uuid
        instance = volume_service.instance_name
        block_shape = (volume_service.block_width,) * 3

        def timed_fetch_blocks_from_box(box):
            """
            Request all blocks within the given box and time the request.
            The response content is neither decompressed nor assembled into a volume.

            Returns:
                (timestamp, voxel_count, response size in bytes, request duration in seconds)
            """
            assert not (box % block_shape).any(), "For this test, all requests must be block-aligned"

            # Convert each block's start corner (zyx) into block coordinates (xyz)
            block_starts_zyx = np.array(list(boxes_from_grid(box, Grid(block_shape))))[:, 0, :]
            block_coords_xyz = block_starts_zyx[:, ::-1] // block_shape
            block_coords_str = ','.join(str(coord) for coord in block_coords_xyz.flat)

            voxel_count = np.prod(box[1] - box[0])

            session = default_dvid_session()
            url = f'{server}/api/node/{uuid}/{instance}/specificblocks?blocks={block_coords_str}'

            # Only the GET itself is timed; the timestamp is taken
            # once the access context has been entered.
            with resource_mgr_client.access_context(server, True, 1, voxel_count):
                timestamp = datetime.now()
                with Timer() as timer:
                    response = session.get(url)

            response.raise_for_status()
            return timestamp, voxel_count, len(response.content), timer.seconds

        # This hash-related hackery is to ensure uniform partition lengths, which Spark is bad at by default.
        boxes = list(clipped_boxes_from_grid(volume_service.bounding_box_zyx,
                                             Grid(volume_service.preferred_message_shape)))
        indexed_boxes = [rt.tuple_with_hash(index_and_box) for index_and_box in enumerate(boxes)]
        for indexed_box in indexed_boxes:
            # Each entry hashes to its own enumeration index
            indexed_box.set_hash(indexed_box[0])

        fetch_stats = self.sc.parallelize(indexed_boxes).values().map(timed_fetch_blocks_from_box)

        # The only reason I'm persisting this is to see the partition distribution in the log.
        rt.persist_and_execute(fetch_stats, "Fetching blocks", logger, StorageLevel.MEMORY_ONLY) #@UndefinedVariable

        # Execute the workload
        collected_stats = fetch_stats.collect()
        timestamps, voxels, sizes, times = zip(*collected_stats)

        # Process the results
        self.dump_stats(timestamps, voxels, sizes, times, block_shape, concurrent_threads)
    def _execute_mesh_generation(self, large_id_box_mask_factor_err):
        """
        Generate a mesh for every element of the given RDD, group the meshes
        according to the configured grouping scheme, and write the groups out
        via post_meshes_to_dvid().

        Args:
            large_id_box_mask_factor_err:
                RDD whose elements are each passed to generate_mesh_in_subprocess().
                (Presumably tuples of (body_id, box, mask, factor, err), judging
                by the name -- TODO confirm against the caller.)

        Raises:
            ValueError: If config["mesh-config"]["storage"]["grouping-scheme"] is not
                        one of "hundreds", "labelmap", "singletons", or "no-groups".
        """
        config = self.config_data

        # Validate the scheme up front, so a bad setting fails
        # before any meshes are computed rather than after.
        grouping_scheme = config["mesh-config"]["storage"]["grouping-scheme"]
        if grouping_scheme not in ("hundreds", "labelmap", "singletons", "no-groups"):
            raise ValueError(f"Unknown grouping-scheme: '{grouping_scheme}'")

        @self.collect_log(lambda _: '_MESH_GENERATION_ERRORS')
        def logged_generate_mesh(arg):
            return generate_mesh_in_subprocess(config, arg)

        #     --> (body_id, mesh_bytes, error_msg)
        body_ids_and_meshes_with_err = large_id_box_mask_factor_err.map( logged_generate_mesh )
        persist_and_execute(body_ids_and_meshes_with_err, "Computing meshes", logger)

        # Errors were already written to a separate file, but let's duplicate them in the master log.
        errors = body_ids_and_meshes_with_err.map(lambda id_mesh_err: id_mesh_err[-1]).filter(bool).collect()
        for error in errors:
            logger.error(error)

        # Filter out error cases
        body_ids_and_meshes = body_ids_and_meshes_with_err.filter(lambda id_mesh_err: id_mesh_err[-1] is None) \
                                                          .map( lambda id_mesh_err: id_mesh_err[:2] )

        # Group according to scheme
        n_partitions = num_worker_nodes() * cpus_per_worker()

        # Note: This must be an equality test.  ('in' on a string is a substring test,
        #       which would also match e.g. "" or "red".)
        if grouping_scheme == "hundreds":
            def hundreds_group_id( id_mesh ):
                # Round the body ID down to the nearest multiple of 100
                body_id, _mesh = id_mesh
                group_id = body_id - (body_id % 100)
                return group_id
            grouped_body_ids_and_meshes = body_ids_and_meshes.groupBy(hundreds_group_id, numPartitions=n_partitions)

        elif grouping_scheme == "labelmap":
            import pandas as pd
            mapping_pairs = load_labelmap( config["mesh-config"]["storage"]["labelmap"], self.config_dir )

            def prepend_mapped_group_id( id_mesh_partition ):
                df = pd.DataFrame( mapping_pairs, columns=["body_id", "group_id"] )

                new_partition = []
                for id_mesh in id_mesh_partition:
                    body_id, mesh = id_mesh
                    rows = df.loc[df.body_id == body_id]
                    if len(rows) == 0:
                        # If missing from labelmap,
                        # we assume an implicit identity mapping
                        group_id = body_id
                    else:
                        # If the body is listed more than once, the first row wins.
                        group_id = rows['group_id'].iloc[0]
                    new_partition.append( (group_id, (body_id, mesh)) )
                return new_partition

            # We do this via mapPartitions().groupByKey() instead of a simple groupBy()
            # to save time constructing the DataFrame inside the closure above.
            # (TODO: Figure out why the dataframe isn't pickling properly...)
            skip_groups = set(config["mesh-config"]["storage"]["skip-groups"])
            grouped_body_ids_and_meshes = body_ids_and_meshes.mapPartitions( prepend_mapped_group_id ) \
                                                             .filter(lambda item: item[0] not in skip_groups) \
                                                             .groupByKey(numPartitions=n_partitions)
        else:
            # grouping_scheme is "singletons" or "no-groups" (validated above).
            # Create 'groups' of one item each, re-using the body ID as the group id.
            # (The difference between 'singletons', and 'no-groups' is in how the mesh is stored, below.)
            grouped_body_ids_and_meshes = body_ids_and_meshes.map( lambda id_mesh: (id_mesh[0], [(id_mesh[0], id_mesh[1])]) )

        persist_and_execute(grouped_body_ids_and_meshes, f"Grouping meshes with scheme: '{grouping_scheme}'", logger)

        # NOTE(review): body_ids_and_meshes is never persisted in this function; the RDD
        #               persisted above is body_ids_and_meshes_with_err.  Confirm which
        #               one was meant to be released here.
        unpersist(body_ids_and_meshes)
        del body_ids_and_meshes

        with Timer() as timer:
            grouped_body_ids_and_meshes.foreachPartition( partial(post_meshes_to_dvid, config) )
        logger.info(f"Writing meshes to DVID took {timer.seconds}")
Ejemplo n.º 18
0
    def execute(self):
        """
        Measure how long it takes to fetch blocks from the input volume.

        The input bounding box is divided into boxes of the volume service's
        preferred message shape.  For each box, the 'specificblocks' endpoint is
        requested (under the resource manager's access context) and the request
        is timed.  The collected measurements are passed to self.dump_stats().

        Raises:
            RuntimeError: If a resource server is configured and its 'read_reqs'
                          setting exceeds the cluster's total CPU count.
        """
        from pyspark import StorageLevel

        self._sanitize_config()
        config = self.config_data
        options = config["options"]

        resource_mgr_client = ResourceManagerClient(options["resource-server"],
                                                    options["resource-port"])
        total_cpus = 16 * num_worker_nodes()

        if options["resource-server"]:
            concurrent_threads = options["resource-server-config"]["read_reqs"]
            if concurrent_threads > total_cpus:
                raise RuntimeError(
                    "You're attempting to use the resource manager to constrain concurrency, but you "
                    "aren't running with a large enough cluster to saturate the resource manager settings"
                )
        else:
            concurrent_threads = total_cpus

        # We instantiate a VolumeService as an easy way to plug in missing config values as necessary.
        # (We won't actually use it.)
        volume_service = VolumeService.create_from_config(
            config["input"], self.config_dir)

        server = volume_service.server
        uuid = volume_service.uuid
        instance = volume_service.instance_name
        block_shape = (volume_service.block_width, ) * 3

        def timed_fetch_blocks_from_box(box):
            """
            Request all blocks within the given box and time the request.
            The response content is neither decompressed nor assembled into a volume.

            Returns:
                (timestamp, voxel_count, response size in bytes, request duration in seconds)
            """
            assert not (box % block_shape).any(
            ), "For this test, all requests must be block-aligned"

            # Convert each block's start corner (zyx) into block coordinates (xyz)
            block_starts_zyx = np.array(
                list(boxes_from_grid(box, Grid(block_shape))))[:, 0, :]
            block_coords_xyz = block_starts_zyx[:, ::-1] // block_shape
            block_coords_str = ','.join(
                str(coord) for coord in block_coords_xyz.flat)

            voxel_count = np.prod(box[1] - box[0])

            session = default_dvid_session()
            url = f'{server}/api/node/{uuid}/{instance}/specificblocks?blocks={block_coords_str}'

            # Only the GET itself is timed; the timestamp is taken
            # once the access context has been entered.
            with resource_mgr_client.access_context(server, True, 1,
                                                    voxel_count):
                timestamp = datetime.now()
                with Timer() as timer:
                    response = session.get(url)

            response.raise_for_status()
            return timestamp, voxel_count, len(response.content), timer.seconds

        # This hash-related hackery is to ensure uniform partition lengths, which Spark is bad at by default.
        boxes = list(
            clipped_boxes_from_grid(
                volume_service.bounding_box_zyx,
                Grid(volume_service.preferred_message_shape)))
        indexed_boxes = [
            rt.tuple_with_hash(index_and_box)
            for index_and_box in enumerate(boxes)
        ]
        for indexed_box in indexed_boxes:
            # Each entry hashes to its own enumeration index
            indexed_box.set_hash(indexed_box[0])

        fetch_stats = self.sc.parallelize(indexed_boxes).values().map(
            timed_fetch_blocks_from_box)

        # The only reason I'm persisting this is to see the partition distribution in the log.
        rt.persist_and_execute(fetch_stats, "Fetching blocks", logger,
                               StorageLevel.MEMORY_ONLY)  #@UndefinedVariable

        # Execute the workload
        collected_stats = fetch_stats.collect()
        timestamps, voxels, sizes, times = zip(*collected_stats)

        # Process the results
        self.dump_stats(timestamps, voxels, sizes, times, block_shape,
                        concurrent_threads)