Python persist_and_execute 예제들, DVIDSparkServices.util.persist_and_execute Python 예제들

예제 #1

0

파일 보기

파일: brick.py 프로젝트: janelia-flyem/DVIDSparkServices

def apply_label_mapping(bricks, mapping_pairs):
    """
    Relabel every brick of an RDD using a pre-loaded labelmap.

    bricks:
        RDD of Bricks containing label volumes

    mapping_pairs:
        Mapping as returned by load_labelmap.
        An ndarray of the form:
            [[orig,new],
             [orig,new],
             ... ],

    Returns:
        The RDD of remapped bricks, after it has been handed to
        persist_and_execute().
    """
    from dvidutils import LabelMapper

    def remap_bricks(partition_bricks):
        # Column 0 holds the original labels, column 1 the replacement labels.
        original_labels, new_labels = mapping_pairs.transpose()
        label_mapper = LabelMapper(original_labels, new_labels)

        remapped = []
        for brick in partition_bricks:
            # TODO: Apparently LabelMapper can't handle non-contiguous arrays right now.
            #       (It yields incorrect results)
            #       Check to see if this is still a problem in the latest version of xtensor-python.
            brick.volume = np.asarray(brick.volume, order='C')

            # The brick's volume is modified in place.
            label_mapper.apply_inplace(brick.volume, allow_unmapped=True)
            remapped.append(brick)
        return remapped

    # Work per partition (instead of per brick) so that
    # the LabelMapper is constructed just once per partition.
    remapped_bricks = rt.map_partitions(remap_bricks, bricks)
    persist_and_execute(remapped_bricks, "Remapping bricks", logger)
    return remapped_bricks

예제 #2

0

파일 보기

파일: CreateMeshes.py 프로젝트: janelia-flyem/DVIDSparkServices

    def group_by_body(self, segments_and_meshes):
        """
        Group an RDD of (body_id, mesh) items using the scheme named in
        config["mesh-config"]["storage"]["grouping-scheme"].

        segments_and_meshes:
            RDD of (body_id, mesh) tuples.

        Supported schemes:
            "hundreds":
                The group ID is the body ID rounded down to a multiple of 100.
                --> (group_id, [(body_id, mesh), ...])
            "labelmap":
                The group ID is looked up in the labelmap returned by
                self.load_labelmap(). Bodies missing from the labelmap are
                mapped to themselves. Groups listed in the "skip-groups"
                setting are dropped.
                --> (group_id, [(body_id, mesh), ...])
            "singletons", "no-groups":
                Every body becomes its own one-item group.
                --> (body_id, [(body_id, mesh)])

        Returns:
            The grouped RDD, after it has been handed to persist_and_execute().

        Raises:
            ValueError: if the configured scheme is not one of the above.
        """
        config = self.config_data

        # Group according to scheme
        storage_config = config["mesh-config"]["storage"]
        grouping_scheme = storage_config["grouping-scheme"]
        n_partitions = num_worker_nodes() * cpus_per_worker()

        # Note: This must be an equality test. A substring test
        #       (grouping_scheme in "hundreds") would also accept "", "s", "hund", etc.
        if grouping_scheme == "hundreds":
            def hundreds_group_id( id_mesh ):
                body_id, _mesh = id_mesh
                return body_id - (body_id % 100)
            grouped_body_ids_and_meshes = segments_and_meshes.groupBy(hundreds_group_id, numPartitions=n_partitions)

        elif grouping_scheme == "labelmap":
            import pandas as pd
            mapping_pairs = self.load_labelmap()

            def prepend_mapped_group_id( id_mesh_partition ):
                df = pd.DataFrame( mapping_pairs, columns=["body_id", "group_id"] )

                new_partition = []
                for id_mesh in id_mesh_partition:
                    body_id, mesh = id_mesh
                    rows = df.loc[df.body_id == body_id]
                    if len(rows) == 0:
                        # If missing from labelmap,
                        # we assume an implicit identity mapping
                        group_id = body_id
                    else:
                        # If the body is listed more than once, the first row wins.
                        group_id = rows['group_id'].iloc[0]
                    new_partition.append( (group_id, (body_id, mesh)) )
                return new_partition

            # We do this via mapPartitions().groupByKey() instead of a simple groupBy()
            # to save time constructing the DataFrame inside the closure above.
            # (TODO: Figure out why the dataframe isn't pickling properly...)
            skip_groups = set(storage_config["skip-groups"])
            grouped_body_ids_and_meshes = segments_and_meshes.mapPartitions( prepend_mapped_group_id ) \
                                                             .filter(lambda item: item[0] not in skip_groups) \
                                                             .groupByKey(numPartitions=n_partitions)
        elif grouping_scheme in ("singletons", "no-groups"):
            # Create 'groups' of one item each, re-using the body ID as the group id.
            # (The difference between 'singletons', and 'no-groups' is in how the mesh is stored, below.)
            grouped_body_ids_and_meshes = segments_and_meshes.map( lambda id_mesh: (id_mesh[0], [(id_mesh[0], id_mesh[1])]) )

        else:
            # Fail with a clear message instead of an UnboundLocalError further down.
            raise ValueError(f"Unknown grouping scheme: '{grouping_scheme}'")

        persist_and_execute(grouped_body_ids_and_meshes, f"Grouping meshes with scheme: '{grouping_scheme}'", logger)
        return grouped_body_ids_and_meshes

예제 #3

0

파일 보기

파일: CreateSkeletons.py 프로젝트: janelia-flyem/DVIDSparkServices

    def _execute_skeletonization(self, large_id_box_mask_factor_err):
        """
        Compute a skeleton for every item of the given RDD, copy any error
        messages into the master log, and hand each partition of the results
        to post_swcs_to_dvid().

        large_id_box_mask_factor_err:
            RDD whose items are passed, one at a time, to skeletonize_in_subprocess().
        """
        config = self.config_data

        @self.collect_log(lambda _: '_SKELETONIZATION_ERRORS')
        def logged_skeletonize(arg):
            return skeletonize_in_subprocess(config, arg)

        #     --> (body_id, swc_contents, error_msg)
        body_ids_and_skeletons = large_id_box_mask_factor_err.map(logged_skeletonize)
        persist_and_execute(body_ids_and_skeletons, "Computing skeletons", logger)

        # The errors already live in a separate file; echo the non-empty ones to the master log, too.
        # (The error message is the last element of each result tuple.)
        error_msgs = body_ids_and_skeletons.map(lambda id_swc_err: id_swc_err[-1])
        nonempty_error_msgs = error_msgs.filter(bool).collect()
        for msg in nonempty_error_msgs:
            logger.error(msg)

        # Write
        with Timer() as timer:
            write_partition = partial(post_swcs_to_dvid, config)
            body_ids_and_skeletons.foreachPartition(write_partition)
        logger.info(f"Writing skeletons to DVID took {timer.seconds}")

예제 #4

0

파일 보기

파일: CreateSkeletons.py 프로젝트: janelia-flyem/flyemflows

    def _execute_skeletonization(self, large_id_box_mask_factor_err):
        """
        Skeletonize each element of the input RDD, report failures in the
        master log, then pass every partition of the results to
        post_swcs_to_dvid().

        large_id_box_mask_factor_err:
            RDD whose elements are fed individually to skeletonize_in_subprocess().
        """
        config = self.config_data

        @self.collect_log(lambda _: '_SKELETONIZATION_ERRORS')
        def logged_skeletonize(arg):
            return skeletonize_in_subprocess(config, arg)

        #     --> (body_id, swc_contents, error_msg)
        body_ids_and_skeletons = large_id_box_mask_factor_err.map(logged_skeletonize)
        persist_and_execute(body_ids_and_skeletons, "Computing skeletons", logger)

        # Each result ends with its error message. Falsy messages are skipped;
        # the rest are repeated in the master log (they were already written
        # to a separate file).
        last_elements = body_ids_and_skeletons.map(lambda id_swc_err: id_swc_err[-1])
        for failure in last_elements.filter(bool).collect():
            logger.error(failure)

        # Write
        with Timer() as timer:
            poster = partial(post_swcs_to_dvid, config)
            body_ids_and_skeletons.foreachPartition(poster)
        logger.info(f"Writing skeletons to DVID took {timer.seconds}")

예제 #5

0

파일 보기

파일: CreateMeshes.py 프로젝트: janelia-flyem/flyemflows

    def execute(self):
        """
        Run the mesh-generation workflow from start to finish.

        Steps, in order (most intermediate RDDs are handed to
        persist_and_execute() before the next step begins):

          1. Load the segmentation as a BrickWall, optionally restricted
             by a sparse block mask.
          2. Compute brick-local segment masks and collect per-segment
             statistics into a DataFrame.
          3. Drop oversized segments, group the masks by segment, attach each
             segment's downsample factor, and filter out segments that are
             not flagged to be kept.
          4. Combine the masks of each segment.
          5. For every (mesh instance, simplification ratio) pair from the
             config: generate meshes, drop segments belonging to bodies whose
             meshes are too large, group by body, and write the groups to DVID.

        Takes no arguments and returns nothing.
        All settings are read from self.config_data.
        """
        import pandas as pd
        self._sanitize_config()

        config = self.config_data
        options = config["options"]

        resource_mgr_client = ResourceManagerClient(options["resource-server"],
                                                    options["resource-port"])
        volume_service = VolumeService.create_from_config(
            config["dvid-info"], self.config_dir, resource_mgr_client)

        self._init_meshes_instances()

        # Aim for 2 GB RDD partitions
        # (The voxel count is derived from the byte size of a uint64.)
        GB = 2**30
        target_partition_size_voxels = 2 * GB // np.uint64().nbytes

        # This will return None if we're not using sparse blocks
        sparse_block_mask = self._get_sparse_block_mask(volume_service)

        brick_wall = BrickWall.from_volume_service(
            volume_service, 0, None, self.sc, target_partition_size_voxels,
            sparse_block_mask)
        brick_wall.persist_and_execute("Downloading segmentation", logger)

        # brick -> [ (segment_label, (box, mask, count)),
        #            (segment_label, (box, mask, count)), ... ]
        segments_and_masks = brick_wall.bricks.map(
            partial(compute_segment_masks, config))
        persist_and_execute(segments_and_masks,
                            "Computing brick-local segment masks", logger)
        brick_wall.unpersist()
        del brick_wall

        with Timer("Computing segment statistics", logger):
            mask_stats_df = self.compute_mask_stats(segments_and_masks)

        # Flatten now, AFTER stats have been computed
        # (compute_mask_stats() requires that the RDDs not have duplicate labels in them.)
        # While we're at it, drop the count (not needed any more)
        # --> (segment_label, (box, mask))
        def drop_count(items):
            new_items = []
            for item in items:
                segment_label, (box, mask, _count) = item
                new_items.append((segment_label, (box, mask)))
            return new_items

        # Note: From here on, the name 'segments_and_masks' refers to the flattened RDD,
        #       not to the RDD that was passed to persist_and_execute() above.
        segments_and_masks = segments_and_masks.flatMap(drop_count)

        # Segments whose 'compressed_bytes' statistic exceeds 1.9e9 are skipped.
        # (presumably to stay below a ~2 GB limit -- TODO confirm)
        bad_segments = mask_stats_df[[
            'segment', 'compressed_bytes'
        ]].query('compressed_bytes > 1.9e9')['segment']
        if len(bad_segments) > 0:
            logger.error(
                f"SOME SEGMENTS (N={len(bad_segments)}) ARE TOO BIG TO PROCESS.  Skipping segments: {list(bad_segments)}."
            )
            segments_and_masks = segments_and_masks.filter(
                lambda seg_mask: seg_mask[0] not in bad_segments.values)

        # (segment, (box, mask))
        #   --> (segment, boxes_and_masks)
        #   === (segment, [(box, mask), (box, mask), (box, mask), ...])
        masks_by_segment_id = segments_and_masks.groupByKey()
        persist_and_execute(masks_by_segment_id,
                            "Grouping segment masks by segment label ID",
                            logger)
        # NOTE(review): 'segments_and_masks' was rebound to the flatMap()/filter() result above,
        #               so this call does not target the RDD that was originally persisted.
        #               Confirm that the originally persisted RDD is actually released.
        segments_and_masks.unpersist()
        del segments_and_masks

        # Insert chosen downsample_factor (a.k.a. dsf)
        #   --> (segment, dsf_and_boxes_and_masks)
        #   === (segment, (downsample_factor, [(box, mask), (box, mask), (box, mask), ...]))
        downsample_df = pd.Series(
            mask_stats_df['downsample_factor'].
            values,  # Must use '.values' here, otherwise
            index=mask_stats_df['segment'].values
        )  # index is used to read initial data.

        # Looks up the downsample factor by segment ID in the Series built above.
        def insert_dsf(item):
            segment, boxes_and_masks = item
            downsample_factor = downsample_df[segment]
            return (segment, (downsample_factor, boxes_and_masks))

        masks_by_segment_id = masks_by_segment_id.map(insert_dsf)

        ##
        ## Filter out small segments and/or small bodies
        ##
        keep_col = mask_stats_df['keep_segment'] & mask_stats_df['keep_body']
        if not keep_col.all():
            # Note: This array will be broadcasted to the workers.
            #       It will be potentially quite large if we're keeping most (but not all) segments.
            #       Broadcast expense should be minimal thanks to lz4 compression,
            #       but RAM usage will be high.
            segments_to_keep = mask_stats_df['segment'][keep_col].values
            filtered_masks_by_segment_id = masks_by_segment_id.filter(
                lambda key_and_value: key_and_value[0] in segments_to_keep)
            persist_and_execute(filtered_masks_by_segment_id,
                                "Filtering masks by segment and size", logger)
            # NOTE(review): 'masks_by_segment_id' is the map(insert_dsf) result at this point.
            #               The RDD persisted after groupByKey() is never explicitly unpersisted -- confirm.
            del masks_by_segment_id
            masks_by_segment_id = filtered_masks_by_segment_id

        # Aggregate
        # --> (segment_label, (box, mask, downsample_factor))
        segment_box_mask_factor = masks_by_segment_id.mapValues(
            partial(combine_masks, config))
        persist_and_execute(segment_box_mask_factor, "Assembling masks",
                            logger)

        #
        # Re-compute meshes once for every simplification ratio in the config
        #
        # Note: zip() stops at the shorter of the two sequences.
        for instance_name, simplification_ratio in zip(
                self.mesh_instances, config["mesh-config"]["simplify-ratios"]):

            # Note: This closure reads 'simplification_ratio' from the enclosing loop.
            #       (presumably fine because the RDD is executed within this same iteration -- TODO confirm)
            def _generate_mesh(box_mask_factor):
                box, mask, factor = box_mask_factor
                return generate_mesh(config, simplification_ratio, box, mask,
                                     factor)

            # --> (segment_label, (mesh_bytes, vertex_count))
            segments_meshes_counts = segment_box_mask_factor.mapValues(
                _generate_mesh)
            persist_and_execute(
                segments_meshes_counts,
                f"Computing meshes at decimation {simplification_ratio:.2f}",
                logger)

            with Timer("Computing mesh statistics", logger):
                mask_and_mesh_stats_df = self.append_mesh_stats(
                    mask_stats_df, segments_meshes_counts,
                    f'{simplification_ratio:.2f}')

            # Update the 'keep_body' column: Skip meshes that are too big.
            huge_bodies = (mask_and_mesh_stats_df['body_mesh_bytes'] > 1.9e9)
            if huge_bodies.any():
                logger.error(
                    "SOME BODY MESH GROUPS ARE TOO BIG TO PROCESS.  See dumped DataFrame for details."
                )
                mask_and_mesh_stats_df['keep_body'] &= ~huge_bodies

                # Drop them from the processing list
                segments_in_huge_bodies = mask_and_mesh_stats_df['segment'][
                    huge_bodies].values
                segments_meshes_counts = segments_meshes_counts.filter(
                    lambda seg_and_values: not (seg_and_values[0] in
                                                segments_in_huge_bodies))

            # --> (segment_label, mesh_bytes)
            def drop_vcount(item):
                segment_label, (mesh_bytes, _vertex_count) = item
                return (segment_label, mesh_bytes)

            segments_and_meshes = segments_meshes_counts.map(drop_vcount)

            # Group by body ID
            # --> ( body_id ( segment_label, mesh_bytes ) )
            grouped_body_ids_segments_meshes = self.group_by_body(
                segments_and_meshes)
            unpersist(segments_and_meshes)
            del segments_and_meshes

            # NOTE(review): If the huge-body filter above was applied, this name refers to the
            #               filtered RDD rather than the one given to persist_and_execute() -- confirm.
            unpersist(segments_meshes_counts)
            del segments_meshes_counts

            with Timer("Writing meshes to DVID", logger):
                grouped_body_ids_segments_meshes.foreachPartition(
                    partial(post_meshes_to_dvid, config, instance_name))

            unpersist(grouped_body_ids_segments_meshes)
            del grouped_body_ids_segments_meshes

예제 #6

0

파일 보기

파일: CreateMeshes.py 프로젝트: janelia-flyem/DVIDSparkServices

    def execute(self):
        """
        Run the mesh-generation workflow from start to finish.

        Steps, in order (most intermediate RDDs are handed to
        persist_and_execute() before the next step begins):

          1. Load the segmentation as a BrickWall, optionally restricted
             by a sparse block mask.
          2. Compute brick-local segment masks and collect per-segment
             statistics into a DataFrame.
          3. Drop oversized segments, group the masks by segment, attach each
             segment's downsample factor, and filter out segments that are
             not flagged to be kept.
          4. Combine the masks of each segment.
          5. For every (mesh instance, simplification ratio) pair from the
             config: generate meshes, drop segments belonging to bodies whose
             meshes are too large, group by body, and write the groups to DVID.

        Takes no arguments and returns nothing.
        All settings are read from self.config_data.
        """
        import pandas as pd
        self._sanitize_config()

        config = self.config_data
        options = config["options"]
        
        resource_mgr_client = ResourceManagerClient(options["resource-server"], options["resource-port"])
        volume_service = VolumeService.create_from_config(config["dvid-info"], self.config_dir, resource_mgr_client)

        self._init_meshes_instances()

        # Aim for 2 GB RDD partitions
        # (The voxel count is derived from the byte size of a uint64.)
        GB = 2**30
        target_partition_size_voxels = 2 * GB // np.uint64().nbytes
        
        # This will return None if we're not using sparse blocks
        sparse_block_mask = self._get_sparse_block_mask(volume_service)
        
        brick_wall = BrickWall.from_volume_service(volume_service, 0, None, self.sc, target_partition_size_voxels, sparse_block_mask)
        brick_wall.persist_and_execute("Downloading segmentation", logger)

        # brick -> [ (segment_label, (box, mask, count)),
        #            (segment_label, (box, mask, count)), ... ]
        segments_and_masks = brick_wall.bricks.map( partial(compute_segment_masks, config) )
        persist_and_execute(segments_and_masks, "Computing brick-local segment masks", logger)
        brick_wall.unpersist()
        del brick_wall

        with Timer("Computing segment statistics", logger):
            mask_stats_df = self.compute_mask_stats(segments_and_masks)

        # Flatten now, AFTER stats have been computed
        # (compute_mask_stats() requires that the RDDs not have duplicate labels in them.)
        # While we're at it, drop the count (not needed any more)
        # --> (segment_label, (box, mask))
        def drop_count(items):
            new_items = []
            for item in items:
                segment_label, (box, mask, _count) = item
                new_items.append( (segment_label, (box, mask)) )
            return new_items
        # Note: From here on, the name 'segments_and_masks' refers to the flattened RDD,
        #       not to the RDD that was passed to persist_and_execute() above.
        segments_and_masks = segments_and_masks.flatMap( drop_count )

        # Segments whose 'compressed_bytes' statistic exceeds 1.9e9 are skipped.
        # (presumably to stay below a ~2 GB limit -- TODO confirm)
        bad_segments = mask_stats_df[['segment', 'compressed_bytes']].query('compressed_bytes > 1.9e9')['segment']
        if len(bad_segments) > 0:
            logger.error(f"SOME SEGMENTS (N={len(bad_segments)}) ARE TOO BIG TO PROCESS.  Skipping segments: {list(bad_segments)}.")
            segments_and_masks = segments_and_masks.filter( lambda seg_mask: seg_mask[0] not in bad_segments.values )
        
        # (segment, (box, mask))
        #   --> (segment, boxes_and_masks)
        #   === (segment, [(box, mask), (box, mask), (box, mask), ...])
        masks_by_segment_id = segments_and_masks.groupByKey()
        persist_and_execute(masks_by_segment_id, "Grouping segment masks by segment label ID", logger)
        # NOTE(review): 'segments_and_masks' was rebound to the flatMap()/filter() result above,
        #               so this call does not target the RDD that was originally persisted.
        #               Confirm that the originally persisted RDD is actually released.
        segments_and_masks.unpersist()
        del segments_and_masks

        # Insert chosen downsample_factor (a.k.a. dsf)
        #   --> (segment, dsf_and_boxes_and_masks)
        #   === (segment, (downsample_factor, [(box, mask), (box, mask), (box, mask), ...]))
        downsample_df = pd.Series( mask_stats_df['downsample_factor'].values, # Must use '.values' here, otherwise
                                   index=mask_stats_df['segment'].values )    # index is used to read initial data.
        # Looks up the downsample factor by segment ID in the Series built above.
        def insert_dsf(item):
            segment, boxes_and_masks = item
            downsample_factor = downsample_df[segment]
            return (segment, (downsample_factor, boxes_and_masks))
        masks_by_segment_id = masks_by_segment_id.map( insert_dsf )

        ##
        ## Filter out small segments and/or small bodies
        ##
        keep_col = mask_stats_df['keep_segment'] & mask_stats_df['keep_body']
        if not keep_col.all():
            # Note: This array will be broadcasted to the workers.
            #       It will be potentially quite large if we're keeping most (but not all) segments.
            #       Broadcast expense should be minimal thanks to lz4 compression,
            #       but RAM usage will be high.
            segments_to_keep = mask_stats_df['segment'][keep_col].values
            filtered_masks_by_segment_id = masks_by_segment_id.filter( lambda key_and_value: key_and_value[0] in segments_to_keep )
            persist_and_execute(filtered_masks_by_segment_id, "Filtering masks by segment and size", logger)
            # NOTE(review): 'masks_by_segment_id' is the map(insert_dsf) result at this point.
            #               The RDD persisted after groupByKey() is never explicitly unpersisted -- confirm.
            del masks_by_segment_id
            masks_by_segment_id = filtered_masks_by_segment_id

        # Aggregate
        # --> (segment_label, (box, mask, downsample_factor))
        segment_box_mask_factor = masks_by_segment_id.mapValues( partial(combine_masks, config) )
        persist_and_execute(segment_box_mask_factor, "Assembling masks", logger)

        #
        # Re-compute meshes once for every simplification ratio in the config
        #
        # Note: zip() stops at the shorter of the two sequences.
        for instance_name, simplification_ratio in zip(self.mesh_instances, config["mesh-config"]["simplify-ratios"]):
            # Note: This closure reads 'simplification_ratio' from the enclosing loop.
            #       (presumably fine because the RDD is executed within this same iteration -- TODO confirm)
            def _generate_mesh(box_mask_factor):
                box, mask, factor = box_mask_factor
                return generate_mesh(config, simplification_ratio, box, mask, factor)
    
            # --> (segment_label, (mesh_bytes, vertex_count))
            segments_meshes_counts = segment_box_mask_factor.mapValues( _generate_mesh )
            persist_and_execute(segments_meshes_counts, f"Computing meshes at decimation {simplification_ratio:.2f}", logger)
    
            with Timer("Computing mesh statistics", logger):
                mask_and_mesh_stats_df = self.append_mesh_stats( mask_stats_df, segments_meshes_counts, f'{simplification_ratio:.2f}' )
    
            # Update the 'keep_body' column: Skip meshes that are too big.
            huge_bodies = (mask_and_mesh_stats_df['body_mesh_bytes'] > 1.9e9)
            if huge_bodies.any():
                logger.error("SOME BODY MESH GROUPS ARE TOO BIG TO PROCESS.  See dumped DataFrame for details.")
                mask_and_mesh_stats_df['keep_body'] &= ~huge_bodies
    
                # Drop them from the processing list
                segments_in_huge_bodies = mask_and_mesh_stats_df['segment'][huge_bodies].values
                segments_meshes_counts = segments_meshes_counts.filter(lambda seg_and_values: not (seg_and_values[0] in segments_in_huge_bodies))
    
            # --> (segment_label, mesh_bytes)
            def drop_vcount(item):
                segment_label, (mesh_bytes, _vertex_count) = item
                return (segment_label, mesh_bytes)
            segments_and_meshes = segments_meshes_counts.map(drop_vcount)
    
            # Group by body ID
            # --> ( body_id ( segment_label, mesh_bytes ) )
            grouped_body_ids_segments_meshes = self.group_by_body(segments_and_meshes)
            unpersist(segments_and_meshes)
            del segments_and_meshes
    
            # NOTE(review): If the huge-body filter above was applied, this name refers to the
            #               filtered RDD rather than the one given to persist_and_execute() -- confirm.
            unpersist(segments_meshes_counts)
            del segments_meshes_counts

            with Timer("Writing meshes to DVID", logger):
                grouped_body_ids_segments_meshes.foreachPartition( partial(post_meshes_to_dvid, config, instance_name) )
            
            unpersist(grouped_body_ids_segments_meshes)
            del grouped_body_ids_segments_meshes

예제 #7

0

파일 보기

파일: CreateSkeletons.py 프로젝트: janelia-flyem/DVIDSparkServices

    def _execute_mesh_generation(self, large_id_box_mask_factor_err):
        """
        Generate a mesh for every item of the given RDD, group the successful
        results according to config["mesh-config"]["storage"]["grouping-scheme"],
        and hand each partition of the groups to post_meshes_to_dvid().

        large_id_box_mask_factor_err:
            RDD whose items are passed, one at a time, to generate_mesh_in_subprocess().

        Supported grouping schemes:
            "hundreds":
                The group ID is the body ID rounded down to a multiple of 100.
            "labelmap":
                The group ID is looked up in the labelmap loaded via load_labelmap().
                Bodies missing from the labelmap are mapped to themselves.
                Groups listed in the "skip-groups" setting are dropped.
            "singletons", "no-groups":
                Every body becomes its own one-item group.

        Raises:
            ValueError: if the configured grouping scheme is not one of the above.
        """
        config = self.config_data
        @self.collect_log(lambda _: '_MESH_GENERATION_ERRORS')
        def logged_generate_mesh(arg):
            return generate_mesh_in_subprocess(config, arg)
        
        #     --> (body_id, mesh_bytes, error_msg)
        body_ids_and_meshes_with_err = large_id_box_mask_factor_err.map( logged_generate_mesh )
        persist_and_execute(body_ids_and_meshes_with_err, "Computing meshes", logger)

        # Errors were already written to a separate file, but let's duplicate them in the master log.
        errors = body_ids_and_meshes_with_err.map(lambda id_mesh_err: id_mesh_err[-1]).filter(bool).collect()
        for error in errors:
            logger.error(error)

        # Filter out error cases
        # (Only items whose error_msg is None are kept, reduced to their first two elements.)
        body_ids_and_meshes = body_ids_and_meshes_with_err.filter(lambda id_mesh_err: id_mesh_err[-1] is None) \
                                                          .map( lambda id_mesh_err: id_mesh_err[:2] )

        # Group according to scheme
        storage_config = config["mesh-config"]["storage"]
        grouping_scheme = storage_config["grouping-scheme"]
        n_partitions = num_worker_nodes() * cpus_per_worker()

        # Note: This must be an equality test. A substring test
        #       (grouping_scheme in "hundreds") would also accept "", "s", "hund", etc.
        if grouping_scheme == "hundreds":
            def hundreds_group_id( id_mesh ):
                body_id, _mesh = id_mesh
                return body_id - (body_id % 100)
            grouped_body_ids_and_meshes = body_ids_and_meshes.groupBy(hundreds_group_id, numPartitions=n_partitions)

        elif grouping_scheme == "labelmap":
            import pandas as pd
            mapping_pairs = load_labelmap( storage_config["labelmap"], self.config_dir )

            def prepend_mapped_group_id( id_mesh_partition ):
                df = pd.DataFrame( mapping_pairs, columns=["body_id", "group_id"] )

                new_partition = []
                for id_mesh in id_mesh_partition:
                    body_id, mesh = id_mesh
                    rows = df.loc[df.body_id == body_id]
                    if len(rows) == 0:
                        # If missing from labelmap,
                        # we assume an implicit identity mapping
                        group_id = body_id
                    else:
                        # If the body is listed more than once, the first row wins.
                        group_id = rows['group_id'].iloc[0]
                    new_partition.append( (group_id, (body_id, mesh)) )
                return new_partition
            
            # We do this via mapPartitions().groupByKey() instead of a simple groupBy()
            # to save time constructing the DataFrame inside the closure above.
            # (TODO: Figure out why the dataframe isn't pickling properly...)
            skip_groups = set(storage_config["skip-groups"])
            grouped_body_ids_and_meshes = body_ids_and_meshes.mapPartitions( prepend_mapped_group_id ) \
                                                             .filter(lambda item: item[0] not in skip_groups) \
                                                             .groupByKey(numPartitions=n_partitions)
        elif grouping_scheme in ("singletons", "no-groups"):
            # Create 'groups' of one item each, re-using the body ID as the group id.
            # (The difference between 'singletons', and 'no-groups' is in how the mesh is stored, below.)
            grouped_body_ids_and_meshes = body_ids_and_meshes.map( lambda id_mesh: (id_mesh[0], [(id_mesh[0], id_mesh[1])]) )

        else:
            # Fail with a clear message instead of an UnboundLocalError further down.
            raise ValueError(f"Unknown grouping scheme: '{grouping_scheme}'")

        persist_and_execute(grouped_body_ids_and_meshes, f"Grouping meshes with scheme: '{grouping_scheme}'", logger)
        unpersist(body_ids_and_meshes)
        del body_ids_and_meshes
        
        with Timer() as timer:
            grouped_body_ids_and_meshes.foreachPartition( partial(post_meshes_to_dvid, config) )
        logger.info(f"Writing meshes to DVID took {timer.seconds}")

예제 #8

0

파일 보기

파일: CreateSkeletons.py 프로젝트: janelia-flyem/DVIDSparkServices

    def execute(self):
        """
        Run the workflow from start to finish:

          1. Load the segmentation as a BrickWall.
          2. Compute brick-local masks for each body, discard bodies reported
             by self.list_unmanageable_bodies(), and group the masks by body ID.
          3. Filter the groups with is_combined_object_large_enough().
          4. Combine each group's masks via combine_masks_in_subprocess(),
             and copy any error messages into the master log.
          5. Depending on the configured "output-types", run
             skeletonization and/or mesh generation on the results.

        Takes no arguments and returns nothing.
        All settings are read from self.config_data.
        """
        self._sanitize_config()

        config = self.config_data
        options = config["options"]

        resource_mgr_client = ResourceManagerClient(options["resource-server"], options["resource-port"])
        volume_service = VolumeService.create_from_config(config["dvid-info"], self.config_dir, resource_mgr_client)

        self._init_skeletons_instance()

        # Aim for 2 GB RDD partitions
        GB = 2**30
        bytes_per_voxel = np.uint64().nbytes
        target_partition_size_voxels = 2 * GB // bytes_per_voxel

        wall = BrickWall.from_volume_service(volume_service, 0, None, self.sc, target_partition_size_voxels)
        wall.persist_and_execute("Downloading segmentation", logger)

        # brick -> (body_id, (box, mask, count))
        compute_body_masks = partial(body_masks, config)
        body_ids_and_masks = wall.bricks.flatMap(compute_body_masks)
        persist_and_execute(body_ids_and_masks, "Computing brick-local masks", logger)
        wall.unpersist()
        del wall

        # Catastrophic merges can produce bodies that are too big to handle.
        # (Skeletonizing them would probably time out anyway.)
        bad_bodies = self.list_unmanageable_bodies(body_ids_and_masks)
        body_ids_and_masks = body_ids_and_masks.filter(lambda k_v: k_v[0] not in bad_bodies)

        # (body_id, (box, mask, count))
        #   --> (body_id, [(box, mask, count), (box, mask, count), (box, mask, count), ...])
        grouped_masks = body_ids_and_masks.groupByKey()
        persist_and_execute(grouped_masks, "Grouping masks by body id", logger)
        body_ids_and_masks.unpersist()
        del body_ids_and_masks

        # (Same RDD contents, but without small bodies)
        is_large_enough = partial(is_combined_object_large_enough, config)
        grouped_large_masks = grouped_masks.filter(is_large_enough)
        persist_and_execute(grouped_large_masks, "Filtering masks by size", logger)
        grouped_masks.unpersist()
        del grouped_masks

        @self.collect_log(lambda _: '_AGGREGATION_ERRORS')
        def logged_combine(arg):
            return combine_masks_in_subprocess(config, arg)

        #  --> (body_id, combined_box, mask, downsample_factor)
        id_box_mask_factor_err = grouped_large_masks.map(logged_combine)
        persist_and_execute(id_box_mask_factor_err, "Downsampling and aggregating masks", logger)
        grouped_large_masks.unpersist()
        del grouped_large_masks

        # The errors already live in a separate file; echo the non-empty ones to the master log, too.
        # (The error message is the last element of each result tuple.)
        error_msgs = id_box_mask_factor_err.map(lambda i_b_m_f_e: i_b_m_f_e[-1])
        for msg in error_msgs.filter(bool).collect():
            logger.error(msg)

        # Small bodies (or those with errors) were not processed,
        # and 'None' was returned instead of a mask. Remove them.
        def mask_is_not_none(i_b_m_f_e):
            _body, _box, mask, _factor, _err = i_b_m_f_e
            return mask is not None

        large_id_box_mask_factor_err = id_box_mask_factor_err.filter(mask_is_not_none)

        output_types = config["options"]["output-types"]

        if "neutube-skeleton" in output_types:
            self._execute_skeletonization(large_id_box_mask_factor_err)

        if "mesh" in output_types:
            self._execute_mesh_generation(large_id_box_mask_factor_err)

예제 #9

0

파일 보기

파일: CreateSkeletons.py 프로젝트: janelia-flyem/flyemflows

    def _execute_mesh_generation(self, large_id_box_mask_factor_err):
        """
        Generate a mesh for every item of the given RDD, group the successful
        results according to config["mesh-config"]["storage"]["grouping-scheme"],
        and hand each partition of the groups to post_meshes_to_dvid().

        large_id_box_mask_factor_err:
            RDD whose items are passed, one at a time, to generate_mesh_in_subprocess().

        Supported grouping schemes:
            "hundreds":
                The group ID is the body ID rounded down to a multiple of 100.
            "labelmap":
                The group ID is looked up in the labelmap loaded via load_labelmap().
                Bodies missing from the labelmap are mapped to themselves.
                Groups listed in the "skip-groups" setting are dropped.
            "singletons", "no-groups":
                Every body becomes its own one-item group.

        Raises:
            ValueError: if the configured grouping scheme is not one of the above.
        """
        config = self.config_data

        @self.collect_log(lambda _: '_MESH_GENERATION_ERRORS')
        def logged_generate_mesh(arg):
            return generate_mesh_in_subprocess(config, arg)

        #     --> (body_id, mesh_bytes, error_msg)
        body_ids_and_meshes_with_err = large_id_box_mask_factor_err.map(
            logged_generate_mesh)
        persist_and_execute(body_ids_and_meshes_with_err, "Computing meshes",
                            logger)

        # Errors were already written to a separate file, but let's duplicate them in the master log.
        errors = body_ids_and_meshes_with_err.map(
            lambda id_mesh_err: id_mesh_err[-1]).filter(bool).collect()
        for error in errors:
            logger.error(error)

        # Filter out error cases
        # (Only items whose error_msg is None are kept, reduced to their first two elements.)
        body_ids_and_meshes = body_ids_and_meshes_with_err.filter(lambda id_mesh_err: id_mesh_err[-1] is None) \
                                                          .map( lambda id_mesh_err: id_mesh_err[:2] )

        # Group according to scheme
        storage_config = config["mesh-config"]["storage"]
        grouping_scheme = storage_config["grouping-scheme"]
        n_partitions = num_worker_nodes() * cpus_per_worker()

        # Note: This must be an equality test. A substring test
        #       (grouping_scheme in "hundreds") would also accept "", "s", "hund", etc.
        if grouping_scheme == "hundreds":

            def hundreds_group_id(id_mesh):
                body_id, _mesh = id_mesh
                return body_id - (body_id % 100)

            grouped_body_ids_and_meshes = body_ids_and_meshes.groupBy(
                hundreds_group_id, numPartitions=n_partitions)

        elif grouping_scheme == "labelmap":
            import pandas as pd
            mapping_pairs = load_labelmap(storage_config["labelmap"],
                                          self.config_dir)

            def prepend_mapped_group_id(id_mesh_partition):
                df = pd.DataFrame(mapping_pairs,
                                  columns=["body_id", "group_id"])

                new_partition = []
                for id_mesh in id_mesh_partition:
                    body_id, mesh = id_mesh
                    rows = df.loc[df.body_id == body_id]
                    if len(rows) == 0:
                        # If missing from labelmap,
                        # we assume an implicit identity mapping
                        group_id = body_id
                    else:
                        # If the body is listed more than once, the first row wins.
                        group_id = rows['group_id'].iloc[0]
                    new_partition.append((group_id, (body_id, mesh)))
                return new_partition

            # We do this via mapPartitions().groupByKey() instead of a simple groupBy()
            # to save time constructing the DataFrame inside the closure above.
            # (TODO: Figure out why the dataframe isn't pickling properly...)
            skip_groups = set(storage_config["skip-groups"])
            grouped_body_ids_and_meshes = body_ids_and_meshes.mapPartitions( prepend_mapped_group_id ) \
                                                             .filter(lambda item: item[0] not in skip_groups) \
                                                             .groupByKey(numPartitions=n_partitions)
        elif grouping_scheme in ("singletons", "no-groups"):
            # Create 'groups' of one item each, re-using the body ID as the group id.
            # (The difference between 'singletons', and 'no-groups' is in how the mesh is stored, below.)
            grouped_body_ids_and_meshes = body_ids_and_meshes.map(
                lambda id_mesh: (id_mesh[0], [(id_mesh[0], id_mesh[1])]))

        else:
            # Fail with a clear message instead of an UnboundLocalError further down.
            raise ValueError(f"Unknown grouping scheme: '{grouping_scheme}'")

        persist_and_execute(
            grouped_body_ids_and_meshes,
            f"Grouping meshes with scheme: '{grouping_scheme}'", logger)
        unpersist(body_ids_and_meshes)
        del body_ids_and_meshes

        with Timer() as timer:
            grouped_body_ids_and_meshes.foreachPartition(
                partial(post_meshes_to_dvid, config))
        logger.info(f"Writing meshes to DVID took {timer.seconds}")

예제 #10

0

파일 보기

파일: CreateSkeletons.py 프로젝트: janelia-flyem/flyemflows

    def execute(self):
        """
        Run the workflow: download the segmentation, build per-body masks,
        and hand the aggregated masks to every requested output stage.

        Pipeline (each persisted RDD is released once the RDD computed from
        it has itself been persisted):

          1. Download the segmentation into a BrickWall.
          2. brick -> (body_id, (box, mask, count)) via body_masks().
          3. Drop the bodies reported by list_unmanageable_bodies().
          4. Group the masks by body id.
          5. Keep only the groups accepted by is_combined_object_large_enough().
          6. Combine each group's masks with combine_masks_in_subprocess();
             error messages arrive as the last tuple element and are echoed
             to the master log.
          7. Run skeletonization and/or mesh generation, depending on
             options["output-types"].
        """
        self._sanitize_config()

        config = self.config_data
        options = config["options"]

        resource_mgr_client = ResourceManagerClient(options["resource-server"],
                                                    options["resource-port"])
        volume_service = VolumeService.create_from_config(
            config["dvid-info"], self.config_dir, resource_mgr_client)

        self._init_skeletons_instance()

        # Aim for 2 GB RDD partitions
        GB = 2**30
        target_partition_size_voxels = 2 * GB // np.uint64().nbytes

        brick_wall = BrickWall.from_volume_service(
            volume_service, 0, None, self.sc, target_partition_size_voxels)
        brick_wall.persist_and_execute("Downloading segmentation", logger)

        # brick -> (body_id, (box, mask, count))
        body_ids_and_masks = brick_wall.bricks.flatMap(
            partial(body_masks, config))
        persist_and_execute(body_ids_and_masks, "Computing brick-local masks",
                            logger)
        brick_wall.unpersist()
        del brick_wall

        # In the case of catastrophic merges, some bodies may be too big to handle.
        # Skeletonizing them would probably time out anyway.
        bad_bodies = self.list_unmanageable_bodies(body_ids_and_masks)

        # NOTE: filter() returns a new, un-persisted RDD.  It gets its own name
        #       so that the persisted RDD above can still be unpersisted below
        #       (rebinding the old name would leave the cached data unreleased).
        good_body_ids_and_masks = body_ids_and_masks.filter(
            lambda k_v: k_v[0] not in bad_bodies)

        # (body_id, (box, mask, count))
        #   --> (body_id, [(box, mask, count), (box, mask, count), (box, mask, count), ...])
        grouped_body_ids_and_masks = good_body_ids_and_masks.groupByKey()
        persist_and_execute(grouped_body_ids_and_masks,
                            "Grouping masks by body id", logger)
        body_ids_and_masks.unpersist()
        del body_ids_and_masks, good_body_ids_and_masks

        # (Same RDD contents, but without small bodies)
        grouped_large_body_ids_and_masks = grouped_body_ids_and_masks.filter(
            partial(is_combined_object_large_enough, config))
        persist_and_execute(grouped_large_body_ids_and_masks,
                            "Filtering masks by size", logger)
        grouped_body_ids_and_masks.unpersist()
        del grouped_body_ids_and_masks

        @self.collect_log(lambda _: '_AGGREGATION_ERRORS')
        def logged_combine(arg):
            return combine_masks_in_subprocess(config, arg)

        #  --> (body_id, combined_box, mask, downsample_factor, error_msg)
        id_box_mask_factor_err = grouped_large_body_ids_and_masks.map(
            logged_combine)
        persist_and_execute(id_box_mask_factor_err,
                            "Downsampling and aggregating masks", logger)
        grouped_large_body_ids_and_masks.unpersist()
        del grouped_large_body_ids_and_masks

        # Errors were already written to a separate file, but let's duplicate them in the master log.
        errors = id_box_mask_factor_err.map(
            lambda i_b_m_f_e: i_b_m_f_e[-1]).filter(bool).collect()
        for error in errors:
            logger.error(error)

        # Small bodies (or those with errors) were not processed,
        # and 'None' was returned instead of a mask. Remove them.
        def mask_is_not_none(i_b_m_f_e):
            _body_id, _combined_box, combined_mask, _downsample_factor, _error_msg = i_b_m_f_e
            return combined_mask is not None

        large_id_box_mask_factor_err = id_box_mask_factor_err.filter(
            mask_is_not_none)

        output_types = options["output-types"]

        if "neutube-skeleton" in output_types:
            self._execute_skeletonization(large_id_box_mask_factor_err)

        if "mesh" in output_types:
            self._execute_mesh_generation(large_id_box_mask_factor_err)

예제 #11

0

파일 보기

    def execute(self):
        """
        Check DVID's coarse sparsevol (block) index against the segmentation.

        For every non-zero body in the input volume, the set of blocks
        (BLKSIZE-wide cubes) the body touches is computed from the voxel data
        and compared with the block list returned by DVID's
        ``sparsevols-coarse`` range query.  Every discrepancy is written, as
        JSON, to the file named by ``config_data["output"]``.  Each entry has
        the form ``[flag, body_id, [(z, y, x), ...]]`` where the flag is:

        - ``True``:  blocks found in the volume but absent from DVID's reply
                     (or the body is absent from the reply altogether)
        - ``False``: blocks in DVID's reply that the body does not touch
        """
        self._sanitize_config()

        # hard coding block size that is 64
        # (TODO: make dynamic)
        BLKSIZE = 64

        input_config = self.config_data["input"]
        input_bricks, _bounding_box, _input_grid = self._partition_input()

        persist_and_execute(input_bricks, "Reading entire volume", logger)

        # find the blocks for each body
        def extractBodyBlockIds(brick):
            """Determine which blocks intersect each body.

            FlatMap: brick -> (bodyid, [blockids])
            """
            vol = brick.volume
            offsetzyx = brick.physical_box[0]

            # Each brick must be exactly one block thick in Z and Y,
            # and a whole number of blocks long in X.
            zsz, ysz, xsz = vol.shape
            assert zsz == BLKSIZE
            assert ysz == BLKSIZE
            assert xsz % BLKSIZE == 0

            zid = offsetzyx[0] // BLKSIZE
            yid = offsetzyx[1] // BLKSIZE
            xid = offsetzyx[2] // BLKSIZE

            bodymappings = {}

            # Walk the brick one block at a time along X.
            for blockspot in range(0, xsz, BLKSIZE):
                bodyids = np.unique(vol[:, :, blockspot:(blockspot + BLKSIZE)])

                for bodyid in bodyids:
                    if bodyid == 0:
                        # ignore background bodies
                        continue
                    bodymappings.setdefault(bodyid, []).append((zid, yid, xid))
                xid += 1

            return list(bodymappings.items())

        allbodies = input_bricks.flatMap(extractBodyBlockIds)
        del input_bricks

        # combine body information across RDD
        def combineBodyInfo(part1, part2):
            part1.extend(part2)
            return part1
        allbodies = allbodies.reduceByKey(combineBodyInfo)
        allbodies.persist()

        # get global list
        globalbodylist = allbodies.map(lambda x: x[0]).collect()
        globalbodylist.sort()

        # Position of every body in the sorted list.
        # (Keys are unique after reduceByKey(), so this is equivalent to
        # globalbodylist.index(body), without a linear scan per body.)
        body_rank = {body: rank for rank, body in enumerate(globalbodylist)}

        # group sorted bodies: consecutive runs of BODYLIMIT body ids share one
        # group, so that each group can be fetched with a single range query.
        BODYLIMIT = 1000
        def findorder(bodyblocks):
            body, blocks = bodyblocks
            index = body_rank[body] // BODYLIMIT
            return (index, [(body, blocks)])
        allbodies_index = allbodies.map(findorder)

        def orderbodies(b1, b2):
            b1.extend(b2)
            return b1
        allbodies_sorted = allbodies_index.reduceByKey(orderbodies)

        # TODO extract indices in separate step to measure fetch time
        # fetch indices for provided block and produce list of [body, bad ids]
        # (Copy everything the closure needs into locals, so that 'self'
        #  is not referenced from inside findindexerrors.)
        server = input_config["dvid"]["server"]
        uuid = input_config["dvid"]["uuid"]
        resource_server = self.resource_server
        resource_port = self.resource_port
        labelname = input_config["dvid"]["segmentation-name"]
        appname = self.APPNAME

        def findindexerrors(bodies):
            """Compare one group of bodies against DVID's coarse block index.

            FlatMap: (group_index, [(body, [blockids]), ...])
                        -> [[flag, body, [blockids]], ...]
            """
            _group_index, bodylist = bodies
            bodymappings = {body: bids for (body, bids) in bodylist}

            # call block index DVID API
            from libdvid import ConnectionMethod

            # The query covers the whole id range spanned by this group.
            b1 = min(bodymappings)
            b2 = max(bodymappings)

            ns = retrieve_node_service(server, uuid, resource_server, resource_port, appname)

            addr = f"{labelname}/sparsevols-coarse/{b1}/{b2}"
            res = ns.custom_request(addr, None, ConnectionMethod.GET)

            # The reply is parsed as a flat stream of 32-bit words.
            # (frombuffer gives a read-only view; the data is only read here.)
            bodyblockrle = np.frombuffer(res, dtype=np.int32)
            currindex = 0

            bodymappingsdvid = {}
            while currindex < len(bodyblockrle):
                # retrieve body: two consecutive 32-bit words, combined with
                # the first word as the lower half.  The words are masked and
                # combined as Python ints: an int32 can neither be shifted by
                # 32 bits nor hold a lower half >= 2**31 without sign errors.
                # NOTE(review): assumes the reply encodes the body id as an
                #               unsigned 64-bit value -- confirm against DVID.
                low_word = int(bodyblockrle[currindex]) & 0xFFFFFFFF
                high_word = int(bodyblockrle[currindex + 1]) & 0xFFFFFFFF
                currbody = low_word | (high_word << 32)
                currindex += 2

                # retrieve runlengths
                numspans = bodyblockrle[currindex]
                currindex += 1
                blockarray = []
                for _ in range(numspans):
                    # Each span is four words: x, y, z, run length along x.
                    dimx, dimy, dimz, runx = bodyblockrle[currindex:currindex + 4]
                    currindex += 4

                    # create body mappings
                    for xblock in range(dimx, dimx + runx):
                        blockarray.append((dimz, dimy, xblock))
                bodymappingsdvid[currbody] = blockarray

            allerrors = []
            # find differences
            for body, blocklist in bodymappings.items():
                if body not in bodymappingsdvid:
                    allerrors.append([True, body, blocklist])
                    continue

                bset = set(blocklist)
                bsetdvid = set(bodymappingsdvid[body])

                # false negatives
                errors = list(bset - bsetdvid)
                if errors:
                    allerrors.append([True, body, errors])

                # false positives
                errors2 = list(bsetdvid - bset)
                if errors2:
                    allerrors.append([False, body, errors2])
            return allerrors

        badindices = allbodies_sorted.flatMap(findindexerrors)

        # report errors
        allerrors = badindices.collect()

        # TODO provide link locations for bad bodies
        #self._log_neuroglancer_links()

        # Use a context manager so the report is flushed and closed.
        with open(self.config_data["output"], 'w') as fout:
            fout.write(json.dumps(allerrors, indent=2, cls=NumpyConvertingEncoder))

        logger.info("DONE analyzing segmentation.")