Example 1
def export_supervoxel_stats(h5_path, output_csv_path, delimiter=' '):
    block_sv_stats = load_stats_h5_to_records(h5_path, False)

    with Timer(f"Sorting {len(block_sv_stats)} block stats", logger):
        block_sv_stats.sort(order=['segment_id', 'z', 'y', 'x', 'count'])

    with Timer(f"Converting coordinates to block indexes", logger):
        _convert_coords_to_block_indexes(block_sv_stats)

    _export_csv(block_sv_stats, output_csv_path)
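
All of these examples lean on DVIDSparkServices.util.Timer as a logging context manager: it optionally takes a message and a logger, and exposes .seconds and .timedelta once the block exits. Below is a minimal stand-in inferred from that usage, handy for running the snippets in isolation; the real implementation may differ.

import time
import logging
from datetime import timedelta

class Timer:
    """Minimal stand-in for DVIDSparkServices.util.Timer, inferred from its usage above."""
    def __init__(self, msg=None, logger=None):
        self.msg, self.logger = msg, logger

    def __enter__(self):
        if self.msg and self.logger:
            self.logger.info(self.msg + "...")
        self.start = time.perf_counter()
        return self

    def __exit__(self, *args):
        self.seconds = time.perf_counter() - self.start
        self.timedelta = timedelta(seconds=self.seconds)
        if self.msg and self.logger:
            self.logger.info(f"{self.msg} took {self.timedelta}")

with Timer("Summing", logging.getLogger(__name__)):
    total = sum(range(1_000_000))
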
Example 2
def persist_and_execute(rdd, description, logger=None, storage=None):
    """
    Persist and execute the given RDD or iterable.
    The persisted RDD is returned (in the case of an iterable, it may not be the original)
    """
    if logger:
        logger.info(f"{description}...")

    with Timer() as timer:
        if isinstance(rdd, _RDD):
            if storage is None:
                from pyspark import StorageLevel
                storage = StorageLevel.MEMORY_ONLY

            rdd.persist(storage)
            count = rdd.count()  # force eval
            parts = rdd.getNumPartitions()
            partition_counts = rdd.mapPartitions(
                lambda part: [sum(1 for _ in part)]).collect()
            histogram = defaultdict(int)
            for c in partition_counts:
                histogram[c] += 1
            histogram = dict(histogram)
        else:
            rdd = list(rdd)  # force eval and 'persist' in a new list
            count = len(rdd)
            parts = 1
            histogram = {count: 1}

    if logger:
        logger.info(
            f"{description} (N={count}, P={parts}, P_hist={histogram}) took {timer.timedelta}"
        )

    return rdd
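
The P_hist reported above is a histogram of rows-per-partition, i.e. how many partitions hold a given number of records. A Spark-free sketch of the same bookkeeping, using hypothetical lists in place of RDD partitions:

from collections import Counter

partitions = [[1, 2, 3], [4, 5], [6], [7, 8, 9]]        # stand-ins for RDD partitions
partition_counts = [sum(1 for _ in p) for p in partitions]
histogram = dict(Counter(partition_counts))             # {rows-per-partition: num partitions}
count, parts = sum(partition_counts), len(partitions)
print(f"N={count}, P={parts}, P_hist={histogram}")      # N=9, P=4, P_hist={3: 2, 2: 1, 1: 1}
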
Example 3
def export_body_stats(h5_path, mapping_csv, output_csv_path, delimiter=' '):
    mapping_pairs = load_edge_csv(mapping_csv)
    segment_to_body_df = pd.DataFrame(mapping_pairs,
                                      columns=['segment_id', 'body_id'])

    block_sv_stats = load_stats_h5_to_records(h5_path, True)
    _overwrite_body_id_column(block_sv_stats, segment_to_body_df)

    with Timer(f"Sorting {len(block_sv_stats)} block stats", logger):
        block_sv_stats.sort(
            order=['body_id', 'segment_id', 'z', 'y', 'x', 'count'])

    with Timer(f"Converting coordinates to block indexes", logger):
        _convert_coords_to_block_indexes(block_sv_stats)

    _export_csv(block_sv_stats, output_csv_path)
Example 4
    def _execute_labelindices(self, mapping_df):
        config = self.config_data
        options = config["options"]
        resource_manager_client = ResourceManagerClient(
            options["resource-server"], options["resource-port"])

        last_mutid = options["mutation-id"]
        server = config["dvid"]["server"]
        uuid = config["dvid"]["uuid"]
        instance_name = config["dvid"]["segmentation-name"]
        endpoint = f'{server}/api/node/{uuid}/{instance_name}/indices'

        processor = StatsBatchProcessor(last_mutid, endpoint)

        # Load the h5 file
        block_sv_stats = load_stats_h5_to_records(config["block-stats-file"])

        # Note: Initializing this generator involves sorting the (very large) stats array
        batch_rows = options["batch-row-count"]
        batch_generator = generate_stats_batches(block_sv_stats, mapping_df,
                                                 batch_rows)

        batches = self.sc.parallelize(batch_generator,
                                      cpus_per_worker() * num_worker_nodes())
        rt.persist_and_execute(batches, "Distributing batches", logger)

        def process_batch(item):
            stats_batch, total_rows = item
            approximate_bytes = 30 * total_rows  # this is highly unscientific
            with resource_manager_client.access_context(
                    server, False, 1, approximate_bytes):
                processor.process_batch((stats_batch, total_rows))

        with Timer("Processing/sending batches", logger):
            batches.foreach(process_batch)
Example 5
def _export_csv(stats, output_csv_path, delimiter=' '):
    if os.path.exists(output_csv_path):
        os.unlink(output_csv_path)

    with Timer(f"Writing sorted stats to {output_csv_path}", logger):
        chunk_size = 10_000_000
        for row_start in tqdm(range(0, len(stats), chunk_size)):
            row_stop = min(row_start + chunk_size, len(stats))
            df = pd.DataFrame(stats[row_start:row_stop])
            df.to_csv(output_csv_path,
                      sep=delimiter,
                      header=False,
                      index=False,
                      mode='a')
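
_export_csv above appends the stats array to the CSV in fixed-size chunks so the full table never has to be materialized as one DataFrame. A small self-contained sketch of the same pattern, with a hypothetical structured array standing in for the real stats records:

import numpy as np
import pandas as pd

stats = np.zeros(10, dtype=[('segment_id', np.uint64), ('z', np.int32),
                            ('y', np.int32), ('x', np.int32), ('count', np.uint32)])
chunk_size = 4
for row_start in range(0, len(stats), chunk_size):
    row_stop = min(row_start + chunk_size, len(stats))
    pd.DataFrame(stats[row_start:row_stop]).to_csv('/tmp/stats.csv',
                                                   sep=' ',
                                                   header=False,
                                                   index=False,
                                                   mode='a')
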
Example 6
    def _execute_mappings(self, mapping_df):
        config = self.config_data
        if mapping_df is None:
            raise RuntimeError(
                "Can't load mappings: No agglomeration mapping provided.")

        # Just do this from a single machine (the driver), with a big batch size
        # The writes are serialized on the DVID side, anyway.
        with Timer("Sending mapping", logger):
            ingest_mapping(config["dvid"]["server"],
                           config["dvid"]["uuid"],
                           config["dvid"]["segmentation-name"],
                           config["options"]["mutation-id"],
                           mapping_df,
                           batch_size=100_000,
                           show_progress_bar=False,
                           session=default_dvid_session())
Example 7
def skeletonize(config, body_id, combined_box, combined_mask,
                downsample_factor):
    (combined_box_start, _combined_box_stop) = combined_box

    # This config factor is an option to artificially scale the skeletons up before
    # writing them, on top of whatever amount the data was downsampled.
    rescale_factor = config["options"]["rescale-before-write"]
    downsample_factor *= rescale_factor
    combined_box = combined_box * rescale_factor

    with Timer() as timer:
        # FIXME: Should the skeleton-config be tweaked in any way based on the downsample_factor??
        tree = skeletonize_array(combined_mask, config["skeleton-config"])
        tree.rescale(downsample_factor, downsample_factor, downsample_factor,
                     True)
        tree.translate(*combined_box_start.astype(
            np.float64)[::-1])  # Pass x,y,z, not z,y,x

    del combined_mask

    swc_contents = "# {:%Y-%m-%d %H:%M:%S}\n".format(datetime.now())
    swc_contents += "# Generated by the DVIDSparkServices 'CreateSkeletons' workflow.\n"
    swc_contents += f"# (Skeletonization time: {timer.seconds}):\n"
    swc_contents += "# Workflow configuration:\n"
    swc_contents += "# \n"

    # Also show which downsample factor was actually chosen
    config_copy = copy.deepcopy(config)
    config_copy["options"]["(final-downsample-factor)"] = downsample_factor

    config_comment = json_dumps(config_copy,
                                sort_keys=True,
                                indent=4,
                                separators=(',', ': '))
    config_comment = "\n".join("# " + line
                               for line in config_comment.split("\n"))
    config_comment += "\n\n"

    swc_contents += config_comment + tree.toString()

    del tree

    return (body_id, swc_contents)  # No error
Example 8
    def _execute_skeletonization(self, large_id_box_mask_factor_err):
        config = self.config_data

        @self.collect_log(lambda _: '_SKELETONIZATION_ERRORS')
        def logged_skeletonize(arg):
            return skeletonize_in_subprocess(config, arg)

        #     --> (body_id, swc_contents, error_msg)
        body_ids_and_skeletons = large_id_box_mask_factor_err.map(
            logged_skeletonize)
        persist_and_execute(body_ids_and_skeletons, "Computing skeletons",
                            logger)

        # Errors were already written to a separate file, but let's duplicate them in the master log.
        errors = body_ids_and_skeletons.map(
            lambda id_swc_err: id_swc_err[-1]).filter(bool).collect()
        for error in errors:
            logger.error(error)

        # Write
        with Timer() as timer:
            body_ids_and_skeletons.foreachPartition(
                partial(post_swcs_to_dvid, config))
        logger.info(f"Writing skeletons to DVID took {timer.seconds}")
Example 9
        def timed_fetch_blocks_from_box(box):
            """
            Fetch the blocks for a given box and return the time it took to fetch them.
            Do not bother decompressing the blocks or combining them into a single volume.
            """
            assert not (box % block_shape).any(), \
                "For this test, all requests must be block-aligned"
            block_boxes = list(boxes_from_grid(box, Grid(block_shape)))
            block_coords_xyz = np.array(block_boxes)[:, 0, ::-1] // block_shape
            block_coords_str = ','.join(map(str, block_coords_xyz.flat))

            voxel_count = np.prod(box[1] - box[0])

            session = default_dvid_session()
            url = f'{server}/api/node/{uuid}/{instance}/specificblocks?blocks={block_coords_str}'

            with resource_mgr_client.access_context(server, True, 1,
                                                    voxel_count):
                timestamp = datetime.now()
                with Timer() as timer:
                    r = session.get(url)

            r.raise_for_status()
            return timestamp, voxel_count, len(r.content), timer.seconds
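
timed_fetch_blocks_from_box builds the ?blocks= query string from block indexes in X,Y,Z order, while the boxes themselves are stored in Z,Y,X order. A minimal sketch of that coordinate bookkeeping without the Grid/boxes_from_grid helpers (block shape and box values are made up):

import numpy as np

block_shape = np.array([64, 64, 64])              # Z,Y,X
box = np.array([[0, 64, 128], [64, 128, 256]])    # Z,Y,X start/stop, block-aligned

block_starts = np.array([(z, y, x)
                         for z in range(box[0, 0], box[1, 0], block_shape[0])
                         for y in range(box[0, 1], box[1, 1], block_shape[1])
                         for x in range(box[0, 2], box[1, 2], block_shape[2])])

# Reverse to X,Y,Z and convert voxel coordinates to block indexes
block_coords_xyz = block_starts[:, ::-1] // block_shape[::-1]
block_coords_str = ','.join(map(str, block_coords_xyz.flat))
print(block_coords_str)                           # "2,1,0,3,1,0"
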
Example 10
        def writeimagepyramid(part_data):
            logger = logging.getLogger(__name__)
            part, vol = part_data
            offset = part.get_offset()
            zslice = offset.z
            from PIL import Image
            from scipy import ndimage
            import io
            s = default_dvid_session()

            # pad data with delimiter if needed
            timslice = vol[0, :, :]
            shiftx = offset.x % tilesize
            shifty = offset.y % tilesize
            tysize, txsize = timslice.shape
            ysize = tysize + shifty
            xsize = txsize + shiftx
            imslice = np.zeros((ysize, xsize))
            imslice[:, :] = delimiter
            imslice[shifty:ysize, shiftx:xsize] = timslice
            curry = (offset.y - shifty) // 2
            currx = (offset.x - shiftx) // 2

            imlevels = []
            tileoffsetyx = []
            imlevels.append(imslice)
            tileoffsetyx.append((offset.y // tilesize, offset.x // tilesize))

            with Timer() as downsample_timer:
                # use generic downsample algorithm
                for level in range(1, maxlevel + 1):

                    tysize, txsize = imlevels[level - 1].shape

                    shiftx = currx % tilesize
                    shifty = curry % tilesize

                    ysize = tysize + shifty
                    xsize = txsize + shiftx
                    imslice = np.zeros((ysize, xsize))
                    imslice[:, :] = delimiter
                    timslice = ndimage.zoom(imlevels[level - 1], 0.5)
                    imslice[shifty:ysize, shiftx:xsize] = timslice
                    imlevels.append(imslice)
                    tileoffsetyx.append((curry // tilesize, currx // tilesize))

                    curry = (curry - shifty) // 2
                    currx = (currx - shiftx) // 2

            logger.info("Downsampled {} levels in {:.3f} seconds".format(
                maxlevel, downsample_timer.seconds))

            # write tile pyramid using custom requests
            for levelnum in range(0, len(imlevels)):
                levelslice = imlevels[levelnum]
                dim1, dim2 = levelslice.shape

                num1tiles = (dim1 - 1) // tilesize + 1
                num2tiles = (dim2 - 1) // tilesize + 1

                with Timer() as post_timer:
                    for iter1 in range(0, num1tiles):
                        for iter2 in range(0, num2tiles):
                            # extract tile
                            tileholder = np.zeros((tilesize, tilesize),
                                                  np.uint8)
                            tileholder[:, :] = delimiter
                            min1 = iter1 * tilesize
                            min2 = iter2 * tilesize
                            tileslice = levelslice[min1:min1 + tilesize,
                                                   min2:min2 + tilesize]
                            t1, t2 = tileslice.shape
                            tileholder[0:t1, 0:t2] = tileslice

                            starty, startx = tileoffsetyx[levelnum]
                            starty += iter1
                            startx += iter2
                            if createtiles:
                                buf = BytesIO()
                                img = Image.frombuffer('L',
                                                       (tilesize, tilesize),
                                                       tileholder.tobytes(),
                                                       'raw', 'L', 0, 1)
                                img.save(buf, format="png")

                                urlreq = server + "/api/node/" + uuid + "/" + tilename + "/tile/xy/" + str(
                                    levelnum) + "/" + str(startx) + "_" + str(
                                        starty) + "_" + str(zslice)
                                s.post(urlreq, data=buf.getvalue())
                                buf.close()

                            if createtilesjpeg:
                                buf = BytesIO()
                                img = Image.frombuffer('L',
                                                       (tilesize, tilesize),
                                                       tileholder.tobytes(),
                                                       'raw', 'L', 0, 1)
                                img.save(buf, format="jpeg")

                                urlreq = server + "/api/node/" + uuid + "/" + tilenamejpeg + "/tile/xy/" + str(
                                    levelnum) + "/" + str(startx) + "_" + str(
                                        starty) + "_" + str(zslice)
                                s.post(urlreq, data=buf.getvalue())
                                buf.close()
                logger.info("Posted {} tiles (level={}) in {} seconds".format(
                    num1tiles * num2tiles, levelnum, post_timer.seconds))
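
The pyramid code above pads each slice on its top/left edges so that the slice starts on a tile boundary, then records which tile row/column it begins at. The arithmetic in isolation, with made-up offsets and tile size:

import numpy as np

tilesize = 512
delimiter = 0                                 # background/fill value (assumption)
offset_y, offset_x = 1200, 700                # hypothetical slice offset in the volume
timslice = np.random.rand(256, 300)           # hypothetical 2D slice (Y, X)

shifty, shiftx = offset_y % tilesize, offset_x % tilesize
imslice = np.full((timslice.shape[0] + shifty, timslice.shape[1] + shiftx),
                  delimiter, dtype=timslice.dtype)
imslice[shifty:, shiftx:] = timslice          # padded so row/col 0 is tile-aligned

tile_offset_yx = (offset_y // tilesize, offset_x // tilesize)
print(imslice.shape, tile_offset_yx)          # (432, 488) (2, 1)
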
Example 11
        def write_blocks(part_vol):
            logger = logging.getLogger(__name__)
            part, data = part_vol
            offset = part.get_offset()
            reloffset = part.get_reloffset()
            _, _, x_size = data.shape
            if x_size % blksize != 0:
                # check if padded
                raise ValueError("Data is not block aligned")

            shiftedoffset = (offset.z + reloffset.z, offset.y + reloffset.y,
                             offset.x + reloffset.x)
            logger.info("Starting WRITE of partition at: {} size: {}".format(
                shiftedoffset, data.shape))
            node_service = retrieve_node_service(server, uuid, resource_server,
                                                 resource_port, appname)

            # Find all non-zero blocks (and record by block index)
            block_coords = []
            for block_index, block_x in enumerate(range(0, x_size, blksize)):
                if not (data[:, :, block_x:block_x + blksize] == delimiter).all():
                    # (Don't care about Z,Y indexes, just the X-index)
                    block_coords.append((0, 0, block_index))

            # Find *runs* of non-zero blocks
            block_runs = runlength_encode(
                block_coords, True)  # returns [[Z,Y,X1,X2], [Z,Y,X1,X2], ...]

            # Convert stop indexes from inclusive to exclusive
            block_runs[:, -1] += 1

            # Discard Z,Y indexes and convert from indexes to pixels
            ranges = blksize * block_runs[:, 2:4]

            # iterate through contiguous blocks and write to DVID
            # TODO: write compressed data directly into DVID
            for (data_x_start, data_x_end) in ranges:
                with Timer() as copy_timer:
                    datacrop = data[:, :, data_x_start:data_x_end].copy()
                logger.info("Copied {}:{} in {:.3f} seconds".format(
                    data_x_start, data_x_end, copy_timer.seconds))

                data_offset_zyx = (shiftedoffset[0], shiftedoffset[1],
                                   shiftedoffset[2] + data_x_start)

                if dataname is not None:
                    with Timer() as put_timer:
                        if not israw:
                            logger.info("STARTING Put: labels block {}".format(
                                data_offset_zyx))
                            if resource_server != "" or dvid_info[
                                    "dvid-server"].startswith(
                                        "http://127.0.0.1"):
                                node_service.put_labels3D(dataname,
                                                          datacrop,
                                                          data_offset_zyx,
                                                          compress=True,
                                                          throttle=False)
                            else:
                                node_service.put_labels3D(dataname,
                                                          datacrop,
                                                          data_offset_zyx,
                                                          compress=True)
                        else:
                            logger.info("STARTING Put: raw block {}".format(
                                data_offset_zyx))
                            if resource_server != "" or dvid_info[
                                    "dvid-server"].startswith(
                                        "http://127.0.0.1"):
                                node_service.put_gray3D(dataname,
                                                        datacrop,
                                                        data_offset_zyx,
                                                        compress=False,
                                                        throttle=False)
                            else:
                                node_service.put_gray3D(dataname,
                                                        datacrop,
                                                        data_offset_zyx,
                                                        compress=False)
                    logger.info("Put block {} in {:.3f} seconds".format(
                        data_offset_zyx, put_timer.seconds))

                if dataname_lossy is not None:
                    logger.info(
                        "STARTING Put: lossy block {}".format(data_offset_zyx))
                    with Timer() as put_lossy_timer:
                        if resource_server != "" or dvid_info[
                                "dvid-server"].startswith("http://127.0.0.1"):
                            node_service.put_gray3D(dataname_lossy,
                                                    datacrop,
                                                    data_offset_zyx,
                                                    compress=False,
                                                    throttle=False)
                        else:
                            node_service.put_gray3D(dataname_lossy,
                                                    datacrop,
                                                    data_offset_zyx,
                                                    compress=False)
                    logger.info("Put lossy block {} in {:.3f} seconds".format(
                        data_offset_zyx, put_lossy_timer.seconds))
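
write_blocks collects the X-indexes of non-background blocks and then writes them in contiguous runs. The run detection itself, sketched here with plain numpy rather than the library's runlength_encode helper and hypothetical block indexes:

import numpy as np

blksize = 64
block_indexes = np.array([0, 1, 2, 5, 6, 9])   # hypothetical non-empty X-block indexes

# Split into runs of consecutive indexes, then express each run
# as a half-open pixel range [start, stop)
runs = np.split(block_indexes, np.where(np.diff(block_indexes) != 1)[0] + 1)
ranges = [(int(blksize * r[0]), int(blksize * (r[-1] + 1))) for r in runs]
print(ranges)                                  # [(0, 192), (320, 448), (576, 640)]
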
Example 12
    def execute(self):
        self._init_services()
        self._sanitize_config()

        options = self.config_data["options"]

        output_service = self.output_service
        logger.info(
            f"Output bounding box: {output_service.bounding_box_zyx[:,::-1]}")

        # Data is processed in Z-slabs
        slab_depth = options["slices-per-slab"]

        input_bb_zyx = self.input_service.bounding_box_zyx
        _, slice_start_y, slice_start_x = input_bb_zyx[0]

        slab_shape_zyx = input_bb_zyx[1] - input_bb_zyx[0]
        slab_shape_zyx[0] = slab_depth

        slice_shape_zyx = slab_shape_zyx.copy()
        slice_shape_zyx[0] = 1

        # This grid outlines the slabs -- each grid box is a full slab
        slab_grid = Grid(slab_shape_zyx, (0, slice_start_y, slice_start_x))
        slab_boxes = list(clipped_boxes_from_grid(input_bb_zyx, slab_grid))

        for slab_index, slab_box_zyx in enumerate(slab_boxes):
            # Construct BrickWall from input bricks
            num_threads = num_worker_nodes() * cpus_per_worker()
            slab_voxels = np.prod(slab_box_zyx[1] - slab_box_zyx[0])
            voxels_per_thread = slab_voxels / num_threads

            bricked_slab_wall = BrickWall.from_volume_service(
                self.input_service, 0, slab_box_zyx, self.sc,
                voxels_per_thread / 2)

            # Force download
            bricked_slab_wall.persist_and_execute(
                f"Downloading slab {slab_index}/{len(slab_boxes)}: {slab_box_zyx[:,::-1]}",
                logger)

            # Remap to slice-sized "bricks"
            sliced_grid = Grid(slice_shape_zyx, offset=slab_box_zyx[0])
            sliced_slab_wall = bricked_slab_wall.realign_to_new_grid(
                sliced_grid)
            sliced_slab_wall.persist_and_execute(
                f"Assembling slab {slab_index}/{len(slab_boxes)} slices",
                logger)

            # Discard original bricks
            bricked_slab_wall.unpersist()
            del bricked_slab_wall

            def write_slice(brick):
                assert (brick.physical_box == brick.logical_box).all()
                output_service.write_subvolume(brick.volume,
                                               brick.physical_box[0])

            # Export to PNG or TIFF, etc. (automatic via slice path extension)
            with Timer() as timer:
                logger.info(f"Exporting slab {slab_index}/{len(slab_boxes)}",
                            extra={
                                "status":
                                f"Exporting {slab_index}/{len(slab_boxes)}"
                            })
                rt.foreach(write_slice, sliced_slab_wall.bricks)
            logger.info(
                f"Exporting slab {slab_index}/{len(slab_boxes)} took {timer.timedelta}",
                extra={"status": f"Done: {slab_index}/{len(slab_boxes)}"})

            # Discard slice data
            sliced_slab_wall.unpersist()
            del sliced_slab_wall

        logger.info(f"DONE exporting {len(slab_boxes)} slabs.",
                    extra={'status': "DONE"})
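
The export above walks the volume in Z-slabs produced by clipped_boxes_from_grid. The slab boxes themselves can be sketched directly with numpy (hypothetical bounding box and slab depth; the real code uses the Grid helpers):

import numpy as np

input_bb_zyx = np.array([[0, 0, 0], [100, 2048, 2048]])   # hypothetical Z,Y,X bounds
slab_depth = 32

slab_boxes = []
for z in range(input_bb_zyx[0, 0], input_bb_zyx[1, 0], slab_depth):
    z_stop = min(z + slab_depth, input_bb_zyx[1, 0])      # clip the final slab
    slab_boxes.append(np.array([[z,      input_bb_zyx[0, 1], input_bb_zyx[0, 2]],
                                [z_stop, input_bb_zyx[1, 1], input_bb_zyx[1, 2]]]))

print(len(slab_boxes), slab_boxes[-1][:, 0])              # 4 [ 96 100]
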
Example 13
    def _execute_mesh_generation(self, large_id_box_mask_factor_err):
        config = self.config_data

        @self.collect_log(lambda _: '_MESH_GENERATION_ERRORS')
        def logged_generate_mesh(arg):
            return generate_mesh_in_subprocess(config, arg)

        #     --> (body_id, mesh_bytes, error_msg)
        body_ids_and_meshes_with_err = large_id_box_mask_factor_err.map(
            logged_generate_mesh)
        persist_and_execute(body_ids_and_meshes_with_err, "Computing meshes",
                            logger)

        # Errors were already written to a separate file, but let's duplicate them in the master log.
        errors = body_ids_and_meshes_with_err.map(
            lambda id_mesh_err: id_mesh_err[-1]).filter(bool).collect()
        for error in errors:
            logger.error(error)

        # Filter out error cases
        body_ids_and_meshes = body_ids_and_meshes_with_err.filter(lambda id_mesh_err: id_mesh_err[-1] is None) \
                                                          .map( lambda id_mesh_err: id_mesh_err[:2] )

        # Group according to scheme
        grouping_scheme = config["mesh-config"]["storage"]["grouping-scheme"]
        n_partitions = num_worker_nodes() * cpus_per_worker()

        if grouping_scheme in "hundreds":

            def last_six_digits(id_mesh):
                body_id, _mesh = id_mesh
                group_id = body_id - (body_id % 100)
                return group_id

            grouped_body_ids_and_meshes = body_ids_and_meshes.groupBy(
                last_six_digits, numPartitions=n_partitions)

        elif grouping_scheme == "labelmap":
            import pandas as pd
            mapping_pairs = load_labelmap(
                config["mesh-config"]["storage"]["labelmap"], self.config_dir)

            def prepend_mapped_group_id(id_mesh_partition):
                df = pd.DataFrame(mapping_pairs,
                                  columns=["body_id", "group_id"])

                new_partition = []
                for id_mesh in id_mesh_partition:
                    body_id, mesh = id_mesh
                    rows = df.loc[df.body_id == body_id]
                    if len(rows) == 0:
                        # If missing from labelmap,
                        # we assume an implicit identity mapping
                        group_id = body_id
                    else:
                        group_id = rows['group_id'].iloc[0]
                    new_partition.append((group_id, (body_id, mesh)))
                return new_partition

            # We do this via mapPartitions().groupByKey() instead of a simple groupBy()
            # to save time constructing the DataFrame inside the closure above.
            # (TODO: Figure out why the dataframe isn't pickling properly...)
            skip_groups = set(config["mesh-config"]["storage"]["skip-groups"])
            grouped_body_ids_and_meshes = body_ids_and_meshes.mapPartitions( prepend_mapped_group_id ) \
                                                             .filter(lambda item: item[0] not in skip_groups) \
                                                             .groupByKey(numPartitions=n_partitions)
        elif grouping_scheme in ("singletons", "no-groups"):
            # Create 'groups' of one item each, re-using the body ID as the group id.
            # (The difference between 'singletons' and 'no-groups' is in how the mesh is stored, below.)
            grouped_body_ids_and_meshes = body_ids_and_meshes.map(
                lambda id_mesh: (id_mesh[0], [(id_mesh[0], id_mesh[1])]))

        persist_and_execute(
            grouped_body_ids_and_meshes,
            f"Grouping meshes with scheme: '{grouping_scheme}'", logger)
        unpersist(body_ids_and_meshes)
        del body_ids_and_meshes

        with Timer() as timer:
            grouped_body_ids_and_meshes.foreachPartition(
                partial(post_meshes_to_dvid, config))
        logger.info(f"Writing meshes to DVID took {timer.seconds}")
Example 14
def compute_comparison_mapping_table(old_edges, new_edges, sv_sizes=None):
    """
    Given two agglomerations, encoded via old_edges and new_edges
    (in which vertex IDs correspond to supervoxel IDs),
    compute the connected components for both graphs,
    and also the CC of their graph intersection.
    
    Returns the mapping from SV to body (CC) for all three graphs as a pd.DataFrame.
    Each body ID is defined as the minimum SV ID in the body, so of course there
    will be no correspondence between body IDs in the different mappings.
    
    If sv_sizes is provided, the size of each supervoxel is appended as a column in the DataFrame.
    Any supervoxel IDs missing from sv_sizes ("phantom" supervoxels) are presumed to be of size 0.
    
    Note:
        For simply comparing segmentations (regardless of internal merge topology),
        this function may not be what you want.
        Consider two 3-node graphs: A-B-C and B-C-A.
        Those two graphs yield identical segmentations (i.e. a single component),
        but their graph intersection yields two components (A and B-C).
        For that use-case, a set-based comparison is more appropriate:
        compute the CC for the 'old' graph and for the 'new' graph,
        and then simply enumerate the unique body pairs in the resulting table.

    Args:
        old_edges: ndarray, shape (N,2)
        
        new_edges: ndarray, shape (M,2)
        
        sv_sizes: (Optional)
                  Must be a pd.Series as returned by load_supervoxel_sizes(),
                  i.e. sv is the index and size is the value.
    Returns:
        pd.DataFrame, indexed by sv with columns:
        "old_body", "new_body", "intersection_component", and "voxel_count" (if sv_sizes was provided)
    """
    # We require C-order arrays, since we'll be fiddling with dtype views that change the shape of the arrays.
    # https://mail.scipy.org/pipermail/numpy-svn/2015-December/007404.html
    old_edges = old_edges.astype(np.uint64, order='C', copy=False)
    new_edges = new_edges.astype(np.uint64, order='C', copy=False)

    # Edges must be pre-normalized
    assert (old_edges[:, 0] <= old_edges[:, 1]).all()
    assert (new_edges[:, 0] <= new_edges[:, 1]).all()

    with Timer("Removing duplicate edges", logger):
        # Pre-sorting should speed up drop_duplicates()
        old_edges.view([('u', np.uint64), ('v', np.uint64)]).sort()
        new_edges.view([('u', np.uint64), ('v', np.uint64)]).sort()

        old_edges = pd.DataFrame(old_edges,
                                 copy=False).drop_duplicates().values
        new_edges = pd.DataFrame(new_edges,
                                 copy=False).drop_duplicates().values

    with Timer("Computing intersection", logger):
        all_edges = np.concatenate((old_edges, new_edges))
        all_edges.view([('u', np.uint64), ('v', np.uint64)]).sort()
        duplicate_markers = pd.DataFrame(all_edges,
                                         copy=False).duplicated().values
        common_edges = all_edges[duplicate_markers]
        del all_edges

    with Timer("Ensuring identical SV sets", logger):
        old_svs = set(pd.unique(old_edges.flat))
        new_svs = set(pd.unique(new_edges.flat))
        common_svs = set(pd.unique(common_edges.flat))

        # Append identity rows for SVs missing from either graph
        missing_from_old = np.fromiter(new_svs.union(common_svs) - old_svs,
                                       dtype=np.uint64)
        missing_from_new = np.fromiter(old_svs.union(common_svs) - new_svs,
                                       dtype=np.uint64)
        missing_from_common = np.fromiter(new_svs.union(old_svs) - common_svs,
                                          dtype=np.uint64)

        if len(missing_from_old) > 0:
            old_missing_edges = np.concatenate(
                (missing_from_old[:, None], missing_from_old[:, None]), axis=1)
            old_edges = np.concatenate((old_edges, old_missing_edges))

        if len(missing_from_new) > 0:
            new_missing_edges = np.concatenate(
                (missing_from_new[:, None], missing_from_new[:, None]), axis=1)
            new_edges = np.concatenate((new_edges, new_missing_edges))

        if len(missing_from_common) > 0:
            common_missing_edges = np.concatenate(
                (missing_from_common[:, None], missing_from_common[:, None]),
                axis=1)
            common_edges = np.concatenate((common_edges, common_missing_edges))

    with Timer("Computing old mapping", logger):
        old_mapping = mapping_from_edges(old_edges,
                                         sort_by='segment',
                                         as_series=True)

    with Timer("Computing new mapping", logger):
        new_mapping = mapping_from_edges(new_edges,
                                         sort_by='segment',
                                         as_series=True)

    with Timer("Computing intersection mapping", logger):
        intersection_mapping = mapping_from_edges(common_edges,
                                                  sort_by='segment',
                                                  as_series=True)

    assert len(old_mapping.index) == len(new_mapping.index) == len(
        intersection_mapping.index)

    sv_table = pd.DataFrame(
        {
            "old_body": old_mapping,
            "new_body": new_mapping,
            "intersection_component": intersection_mapping
        },
        copy=False)

    sv_table.index = sv_table.index.astype(np.uint64, copy=False)
    sv_table.index.name = "sv"

    if sv_sizes is not None:
        with Timer("Appending supervoxel sizes", logger):
            sv_table = sv_table.merge(pd.DataFrame(sv_sizes),
                                      'left',
                                      left_index=True,
                                      right_index=True,
                                      copy=False)

            # Fix 'phantom' supervoxels (mentioned in the merge graph(s), but not present in the volume)
            sv_table['voxel_count'].fillna(0, inplace=True)
            sv_table['voxel_count'] = sv_table['voxel_count'].astype(np.uint64)

    # Force correct dtypes
    sv_table['old_body'] = sv_table['old_body'].astype(np.uint64)
    sv_table['new_body'] = sv_table['new_body'].astype(np.uint64)
    sv_table['intersection_component'] = sv_table[
        'intersection_component'].astype(np.uint64)
    return sv_table
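
The docstring's caveat about A-B-C vs. B-C-A can be checked directly: both graphs are single components, but their edge intersection splits into two. A small sketch with scipy's connected_components (node labels 0/1/2 standing in for A/B/C):

import numpy as np
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import connected_components

def n_components(edges, n_nodes=3):
    rows, cols = np.array(edges).T
    g = coo_matrix((np.ones(len(edges)), (rows, cols)), shape=(n_nodes, n_nodes))
    return connected_components(g, directed=False)[0]

A, B, C = 0, 1, 2
old_edges = [(A, B), (B, C)]                             # A-B-C
new_edges = [(B, C), (C, A)]                             # B-C-A
common_edges = [e for e in old_edges if e in new_edges]  # edge intersection: just B-C

print(n_components(old_edges), n_components(new_edges), n_components(common_edges))  # 1 1 2
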
Example 15
    def execute(self):
        import pandas as pd
        self._sanitize_config()

        config = self.config_data
        options = config["options"]

        resource_mgr_client = ResourceManagerClient(options["resource-server"],
                                                    options["resource-port"])
        volume_service = VolumeService.create_from_config(
            config["dvid-info"], self.config_dir, resource_mgr_client)

        self._init_meshes_instances()

        # Aim for 2 GB RDD partitions
        GB = 2**30
        target_partition_size_voxels = 2 * GB // np.uint64().nbytes

        # This will return None if we're not using sparse blocks
        sparse_block_mask = self._get_sparse_block_mask(volume_service)

        brick_wall = BrickWall.from_volume_service(
            volume_service, 0, None, self.sc, target_partition_size_voxels,
            sparse_block_mask)
        brick_wall.persist_and_execute("Downloading segmentation", logger)

        # brick -> [ (segment_label, (box, mask, count)),
        #            (segment_label, (box, mask, count)), ... ]
        segments_and_masks = brick_wall.bricks.map(
            partial(compute_segment_masks, config))
        persist_and_execute(segments_and_masks,
                            "Computing brick-local segment masks", logger)
        brick_wall.unpersist()
        del brick_wall

        with Timer("Computing segment statistics", logger):
            mask_stats_df = self.compute_mask_stats(segments_and_masks)

        # Flatten now, AFTER stats have been computed
        # (compute_mask_stats() requires that the RDDs not have duplicate labels in them.)
        # While we're at it, drop the count (not needed any more)
        # --> (segment_label, (box, mask))
        def drop_count(items):
            new_items = []
            for item in items:
                segment_label, (box, mask, _count) = item
                new_items.append((segment_label, (box, mask)))
            return new_items

        segments_and_masks = segments_and_masks.flatMap(drop_count)

        bad_segments = mask_stats_df[[
            'segment', 'compressed_bytes'
        ]].query('compressed_bytes > 1.9e9')['segment']
        if len(bad_segments) > 0:
            logger.error(
                f"SOME SEGMENTS (N={len(bad_segments)}) ARE TOO BIG TO PROCESS.  Skipping segments: {list(bad_segments)}."
            )
            segments_and_masks = segments_and_masks.filter(
                lambda seg_mask: seg_mask[0] not in bad_segments.values)

        # (segment, (box, mask))
        #   --> (segment, boxes_and_masks)
        #   === (segment, [(box, mask), (box, mask), (box, mask), ...])
        masks_by_segment_id = segments_and_masks.groupByKey()
        persist_and_execute(masks_by_segment_id,
                            "Grouping segment masks by segment label ID",
                            logger)
        segments_and_masks.unpersist()
        del segments_and_masks

        # Insert chosen downsample_factor (a.k.a. dsf)
        #   --> (segment, dsf_and_boxes_and_masks)
        #   === (segment, (downsample_factor, [(box, mask), (box, mask), (box, mask), ...]))
        # Must use '.values' on both columns so the Series is a plain
        # segment -> downsample_factor lookup (otherwise pandas would try
        # to align on the original index when reading the initial data).
        downsample_df = pd.Series(mask_stats_df['downsample_factor'].values,
                                  index=mask_stats_df['segment'].values)

        def insert_dsf(item):
            segment, boxes_and_masks = item
            downsample_factor = downsample_df[segment]
            return (segment, (downsample_factor, boxes_and_masks))

        masks_by_segment_id = masks_by_segment_id.map(insert_dsf)

        ##
        ## Filter out small segments and/or small bodies
        ##
        keep_col = mask_stats_df['keep_segment'] & mask_stats_df['keep_body']
        if not keep_col.all():
            # Note: This array will be broadcasted to the workers.
            #       It will be potentially quite large if we're keeping most (but not all) segments.
            #       Broadcast expense should be minimal thanks to lz4 compression,
            #       but RAM usage will be high.
            segments_to_keep = mask_stats_df['segment'][keep_col].values
            filtered_masks_by_segment_id = masks_by_segment_id.filter(
                lambda key_and_value: key_and_value[0] in segments_to_keep)
            persist_and_execute(filtered_masks_by_segment_id,
                                "Filtering masks by segment and size", logger)
            del masks_by_segment_id
            masks_by_segment_id = filtered_masks_by_segment_id

        # Aggregate
        # --> (segment_label, (box, mask, downsample_factor))
        segment_box_mask_factor = masks_by_segment_id.mapValues(
            partial(combine_masks, config))
        persist_and_execute(segment_box_mask_factor, "Assembling masks",
                            logger)

        #
        # Re-compute meshes once for every simplification ratio in the config
        #
        for instance_name, simplification_ratio in zip(
                self.mesh_instances, config["mesh-config"]["simplify-ratios"]):

            def _generate_mesh(box_mask_factor):
                box, mask, factor = box_mask_factor
                return generate_mesh(config, simplification_ratio, box, mask,
                                     factor)

            # --> (segment_label, (mesh_bytes, vertex_count))
            segments_meshes_counts = segment_box_mask_factor.mapValues(
                _generate_mesh)
            persist_and_execute(
                segments_meshes_counts,
                f"Computing meshes at decimation {simplification_ratio:.2f}",
                logger)

            with Timer("Computing mesh statistics", logger):
                mask_and_mesh_stats_df = self.append_mesh_stats(
                    mask_stats_df, segments_meshes_counts,
                    f'{simplification_ratio:.2f}')

            # Update the 'keep_body' column: Skip meshes that are too big.
            huge_bodies = (mask_and_mesh_stats_df['body_mesh_bytes'] > 1.9e9)
            if huge_bodies.any():
                logger.error(
                    "SOME BODY MESH GROUPS ARE TOO BIG TO PROCESS.  See dumped DataFrame for details."
                )
                mask_and_mesh_stats_df['keep_body'] &= ~huge_bodies

                # Drop them from the processing list
                segments_in_huge_bodies = mask_and_mesh_stats_df['segment'][
                    huge_bodies].values
                segments_meshes_counts = segments_meshes_counts.filter(
                    lambda seg_and_values: seg_and_values[0] not in segments_in_huge_bodies)

            # --> (segment_label, mesh_bytes)
            def drop_vcount(item):
                segment_label, (mesh_bytes, _vertex_count) = item
                return (segment_label, mesh_bytes)

            segments_and_meshes = segments_meshes_counts.map(drop_vcount)

            # Group by body ID
            # --> ( body_id, ( segment_label, mesh_bytes ) )
            grouped_body_ids_segments_meshes = self.group_by_body(
                segments_and_meshes)
            unpersist(segments_and_meshes)
            del segments_and_meshes

            unpersist(segments_meshes_counts)
            del segments_meshes_counts

            with Timer("Writing meshes to DVID", logger):
                grouped_body_ids_segments_meshes.foreachPartition(
                    partial(post_meshes_to_dvid, config, instance_name))

            unpersist(grouped_body_ids_segments_meshes)
            del grouped_body_ids_segments_meshes
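
insert_dsf above relies on a pandas Series built from the stats DataFrame to act as a plain segment-to-downsample-factor lookup. In isolation, with toy values:

import pandas as pd

mask_stats = pd.DataFrame({'segment': [10, 11, 12], 'downsample_factor': [1, 2, 4]})

# '.values' on both sides so the Series is a pure lookup table keyed by segment ID
downsample_df = pd.Series(mask_stats['downsample_factor'].values,
                          index=mask_stats['segment'].values)
print(downsample_df[11])   # 2
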
Example 16
    def compute_mask_stats(self, segments_and_masks):
        """
        segments_and_masks: RDD where each element is of the form:
                            (label, (box, mask, count))
                             AND labels within a partition are UNIQUE.
        """
        config = self.config_data

        # In DataFrames, bounding box is stored as 6 int columns instead
        # of 1 'object' column for easier joins, combines, serialization, etc.
        BB_COLS = ['z0', 'y0', 'x0', 'z1', 'y1', 'x1']
        STATS_COLUMNS = ['segment', 'segment_voxel_count',
                         'compressed_bytes'] + BB_COLS

        def stats_df_for_masks(segments_and_masks):
            """
            Convert the list of elements, each in the form: (segment, (box, compressed_mask, count))
            into a pandas DataFrame.
            
            Note: This function assumes that there are no duplicate segments in the list.
                  Therefore, it must be called only with the list of masks from a single 'brick'.
            """
            import pandas as pd
            pd.set_option('expand_frame_repr', False)

            # Each item is (segment, (box, compressed_mask, count))
            bounding_boxes = [
                object_info[1][0] for object_info in segments_and_masks
            ]

            item_df = pd.DataFrame(columns=STATS_COLUMNS)
            item_df['segment'] = [
                object_info[0] for object_info in segments_and_masks
            ]
            item_df['compressed_bytes'] = [
                object_info[1][1].compressed_nbytes
                for object_info in segments_and_masks
            ]
            item_df['segment_voxel_count'] = [
                object_info[1][2] for object_info in segments_and_masks
            ]
            item_df[BB_COLS] = np.array(bounding_boxes).reshape(-1, 6)

            return item_df

        def merge_stats(left, right):
            import pandas as pd
            pd.set_option('expand_frame_repr', False)

            # Join the two DFs and replace missing values with appropriate defaults
            joined = left.merge(right,
                                'outer',
                                on='segment',
                                suffixes=('_left', '_right'),
                                copy=False)
            fillna_inplace(joined, np.inf, ['z0_left', 'y0_left', 'x0_left'])
            fillna_inplace(joined, np.inf,
                           ['z0_right', 'y0_right', 'x0_right'])
            fillna_inplace(joined, -np.inf, ['z1_left', 'y1_left', 'x1_left'])
            fillna_inplace(joined, -np.inf,
                           ['z1_right', 'y1_right', 'x1_right'])
            fillna_inplace(
                joined, 0,
                ['segment_voxel_count_left', 'segment_voxel_count_right'])
            fillna_inplace(joined, 0,
                           ['compressed_bytes_left', 'compressed_bytes_right'])

            # Now that the data is aligned by segment label, combine corresponding columns
            result = pd.DataFrame({'segment': joined['segment']})
            result['segment_voxel_count'] = joined[
                'segment_voxel_count_left'] + joined[
                    'segment_voxel_count_right']
            result['compressed_bytes'] = joined[
                'compressed_bytes_left'] + joined['compressed_bytes_right']
            result[['z0', 'y0', 'x0']] = np.minimum(
                joined[['z0_left', 'y0_left', 'x0_left']],
                joined[['z0_right', 'y0_right', 'x0_right']])
            result[['z1', 'y1', 'x1']] = np.maximum(
                joined[['z1_left', 'y1_left', 'x1_left']],
                joined[['z1_right', 'y1_right', 'x1_right']])
            assert set(result.columns) == set(STATS_COLUMNS)

            return result

        # Calculate segment (a.k.a. supervoxel) stats
        full_stats_df = segments_and_masks.map(stats_df_for_masks).treeReduce(
            merge_stats, depth=4)

        # Convert column types (float64 was used above to handle NaNs, but now we can convert back to int)
        convert_dtype_inplace(full_stats_df, np.uint64,
                              ['segment_voxel_count', 'compressed_bytes'])
        convert_dtype_inplace(
            full_stats_df, np.int64, BB_COLS
        )  # int32 is dangerous because multiplying them together quickly overflows

        full_stats_df['box_size'] = full_stats_df.eval(
            '(z1 - z0)*(y1 - y0)*(x1 - x0)')
        full_stats_df['keep_segment'] = (
            full_stats_df['segment_voxel_count'] >=
            config['options']['minimum-segment-size'])
        full_stats_df['keep_segment'] &= (
            full_stats_df['segment_voxel_count'] <=
            config['options']['maximum-segment-size'])

        max_analysis_voxels = config['options']['max-analysis-volume']

        # The chosen downsample factor is the max of the user's minimum and the auto-computed minimum
        full_stats_df['downsample_factor'] = 1 + np.power(
            full_stats_df['box_size'].values / max_analysis_voxels,
            (1. / 3)).astype(np.int16)
        full_stats_df['downsample_factor'] = np.maximum(
            full_stats_df['downsample_factor'],
            config['options']['minimum-downsample-factor'])

        # Convert to uint8 to save RAM (will be broadcasted to workers)
        assert full_stats_df['downsample_factor'].max() < 256
        full_stats_df['downsample_factor'] = full_stats_df[
            'downsample_factor'].astype(np.uint8)
        assert full_stats_df['downsample_factor'].dtype == np.uint8

        ##
        ## If grouping segments into bodies (for tarballs),
        ## also append body stats
        ##
        grouping_scheme = config["mesh-config"]["storage"]["grouping-scheme"]
        if grouping_scheme == "labelmap":
            import pandas as pd
            mapping_pairs = self.load_labelmap()

            # Add body column
            segment_to_body_df = pd.DataFrame(mapping_pairs,
                                              columns=['segment', 'body'])
            full_stats_df = full_stats_df.merge(segment_to_body_df,
                                                'left',
                                                on='segment',
                                                copy=False)

            # Missing segments in the labelmap are assumed to be identity-mapped
            full_stats_df['body'].fillna(full_stats_df['segment'],
                                         inplace=True)
            full_stats_df['body'] = full_stats_df['body'].astype(np.uint64)

            # Calculate body voxel sizes
            body_stats_df = full_stats_df[[
                'body', 'segment_voxel_count'
            ]].groupby('body').agg(['size', 'sum'])
            body_stats_df.columns = ['body_segment_count', 'body_voxel_count']
            body_stats_df['body'] = body_stats_df.index

            full_stats_df = full_stats_df.merge(body_stats_df,
                                                'left',
                                                on='body',
                                                copy=False)

            if config["options"]["force-uniform-downsampling"]:
                body_downsample_factors = full_stats_df[[
                    'body', 'downsample_factor'
                ]].groupby('body', as_index=False).max()
                adjusted_downsample_factors = full_stats_df[['body']].merge(
                    body_downsample_factors, 'left', on='body')
                full_stats_df[
                    'downsample_factor'] = adjusted_downsample_factors[
                        'downsample_factor'].astype(np.uint8)

            # For offline analysis, write body stats to a file
            output_path = self.config_dir + '/body-stats.csv'
            logger.info(f"Saving body statistics to {output_path}")
            body_stats_df = body_stats_df[[
                'body', 'body_segment_count', 'body_voxel_count'
            ]]  # Set col order
            body_stats_df.columns = ['body', 'segment_count',
                                     'voxel_count']  # rename columns for csv
            body_stats_df.sort_values('voxel_count',
                                      ascending=False,
                                      inplace=True)
            body_stats_df.to_csv(output_path, header=True, index=False)

        else:
            # Not grouping -- Just duplicate segment stats into body columns
            full_stats_df['body'] = full_stats_df['segment']
            full_stats_df['body_voxel_count'] = full_stats_df[
                'segment_voxel_count']

        full_stats_df['keep_body'] = (
            (full_stats_df['body_voxel_count'] >=
             config['options']['minimum-agglomerated-size']) &
            (full_stats_df['body_voxel_count'] <=
             config['options']['maximum-agglomerated-size']))

        # If subset-bodies were given, exclude all others.
        sparse_body_ids = config["mesh-config"]["storage"]["subset-bodies"]
        if sparse_body_ids:
            for body_id in sparse_body_ids:
                if not full_stats_df[full_stats_df['body'] ==
                                     body_id]['keep_body'].all():
                    logger.error(
                        f"You explicitly listed body {body_id} in subset-bodies, "
                        "but it will be excluded due to your other config settings."
                    )
            full_stats_df['keep_body'] &= full_stats_df.eval(
                'body in @sparse_body_ids')

        # Sort for convenience of viewing output
        with Timer("Sorting segment stats", logger):
            full_stats_df.sort_values(
                ['body_voxel_count', 'segment_voxel_count'],
                ascending=False,
                inplace=True)

        #import pandas as pd
        #pd.set_option('expand_frame_repr', False)
        #logger.info(f"FULL_STATS:\n{full_stats_df}")

        stats_bytes = full_stats_df.memory_usage().sum()
        stats_gb = stats_bytes / 1e9

        # Write the Stats DataFrame to a file for offline analysis.
        output_path = self.config_dir + '/segment-stats-dataframe.pkl.xz'
        logger.info(
            f"Saving segment statistics ({stats_gb:.3f} GB) to {output_path}")
        full_stats_df.to_pickle(output_path)

        return full_stats_df
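
The auto-chosen downsample factor above grows with the cube root of how far a segment's bounding-box volume exceeds the analysis budget, so the downsampled box fits back under max-analysis-volume. With made-up numbers:

import numpy as np

max_analysis_voxels = 1e9                         # hypothetical 'max-analysis-volume'
box_size = np.array([5e8, 2e9, 3e10])             # bounding-box volumes of three segments

downsample_factor = 1 + np.power(box_size / max_analysis_voxels, 1. / 3).astype(np.int16)
print(downsample_factor)                          # [1 2 4]
# e.g. 3e10 / 4**3 = 4.7e8, which is back under the 1e9 budget
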
Example 17
    import numpy as np
    from DVIDSparkServices.util import Timer

    blocks = np.random.randint(0, 2, size=(10, 64, 64, 64), dtype=bool)
    #blocks = np.ones((10,64,64,64), dtype=bool)

    for block in blocks:
        # Randomly select a fourth of the subblocks to be completely 1,
        # and one fourth to be completely 0
        block_modes = np.random.randint(0, 4, size=(8, 8, 8), dtype=int)
        v = view_as_blocks(block, (8, 8, 8))
        assert is_view_of(v, blocks)

        v &= (block_modes[..., None, None, None] != 0)
        v |= (block_modes[..., None, None, None] == 1)

    with Timer() as enc_timer:
        encoded = encode_mask_blocks(blocks)

    orig_bytes = 64 * 64 * 64 * len(blocks)
    encoded_bytes = len(encoded)
    print(
        f"Size reduction: {orig_bytes} -> {encoded_bytes} ({orig_bytes/encoded_bytes:.1f}x)"
    )

    with Timer() as dec_timer:
        decoded, corners, label = decode_mask_blocks(encoded)

    print(f"Mask encoding seconds: {enc_timer.seconds}")
    print(f"Mask decoding seconds: {dec_timer.seconds}")

    assert (np.array(decoded) == np.array(blocks)).all()