def _agg(a, sorted_cols, groups, agg_results):
    pos = 0
    for i, group_rows in enumerate(groupby_presorted(a, sorted_cols)):
        groups[i] = sorted_cols[pos]
        pos += len(group_rows)
        agg_results[i] = group_rows.sum(0)  # per-column sums over this group's rows (axis 0)
    return (groups, agg_results)
Example #2
def _agg(a, sorted_cols, groups, agg_results):
    pos = 0
    for i, group_rows in enumerate(groupby_presorted(a, sorted_cols)):
        groups[i] = sorted_cols[pos]
        pos += len(group_rows)
        agg_results[i] = group_rows.sum(0)  # per-column sums over this group's rows (axis 0)
    return (groups, agg_results)
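# A minimal usage sketch for the _agg helper above, with made-up data.
# The import path for groupby_presorted is an assumption; the only behavior
# assumed is what the examples here show: it yields one contiguous subarray
# of `a` per run of identical rows in `sorted_cols`.
import numpy as np
from neuclease.util import groupby_presorted  # assumed source of groupby_presorted

a = np.array([[1, 10],
              [1, 20],
              [2,  5]])
sorted_cols = a[:, :1]   # grouping key: the first column, already sorted

num_groups = 2           # known in advance for this toy input
groups = np.zeros((num_groups, 1), dtype=a.dtype)
agg_results = np.zeros((num_groups, 2), dtype=a.dtype)

groups, agg_results = _agg(a, sorted_cols, groups, agg_results)
# groups      -> [[1], [2]]
# agg_results -> [[ 2, 30],    (column sums of the two rows with key 1)
#                 [ 2,  5]]    (the single row with key 2)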
    def label_indexes_for_body(self, body_group_span):
        """
        Load body_group (a subarray with dtype=STATS_DTYPE
        and only a single unique body_id) into a LabelIndex protobuf structure.
        """
        label_indexes = []

        body_group_start, body_group_stop = body_group_span
        body_group = self.block_sv_stats[body_group_start:body_group_stop]
        body_id = body_group[0]['body_id']

        if (self.subset_labels is not None) and (body_id
                                                 not in self.subset_labels):
            return []

        if self.tombstone_mode != 'only':
            label_index = LabelIndex()
            label_index.label = body_id
            label_index.last_mutid = self.last_mutid
            label_index.last_mod_user = self.user
            label_index.last_mod_time = self.mod_time

            body_dtype = STATS_DTYPE[0]
            segment_dtype = STATS_DTYPE[1]
            coords_dtype = ('coord_cols', STATS_DTYPE[2:5])
            count_dtype = STATS_DTYPE[5]
            assert body_dtype[0] == 'body_id'
            assert segment_dtype[0] == 'segment_id'
            assert np.dtype(coords_dtype[1]).names == ('z', 'y', 'x')
            assert count_dtype[0] == 'count'

            body_group = body_group.view(
                [body_dtype, segment_dtype, coords_dtype, count_dtype])
            coord_cols = body_group['coord_cols'].view(
                (np.int32, 3)).reshape(-1, 3)
            for block_group in groupby_presorted(body_group, coord_cols):
                coord = block_group['coord_cols'][0]
                encoded_block_id = _encode_block_id(coord)
                label_index.blocks[encoded_block_id].counts.update(
                    zip(block_group['segment_id'], block_group['count']))

            label_indexes.append(label_index)

        if self.tombstone_mode in ('include', 'only'):
            # All segments in this body should no longer get a real index
            # (except for the segment that matches the body_id itself).
            # We'll send an empty LabelIndex (a 'tombstone') for each one.
            all_segments = np.unique(body_group['segment_id'])
            tombstone_segments = all_segments[all_segments != body_id]
            for segment_id in tombstone_segments:
                tombstone_index = LabelIndex()
                tombstone_index.label = segment_id
                tombstone_index.last_mutid = self.last_mutid
                tombstone_index.last_mod_user = self.user
                tombstone_index.last_mod_time = self.mod_time
                label_indexes.append(tombstone_index)

        return label_indexes
    def sample_labels(self, points_zyx, scale=0, npartitions=1024):
        """
        Read the label under each of the given points.
        """
        if isinstance(points_zyx, pd.DataFrame):
            assert not ({*'zyx'} - {*points_zyx.columns}), \
                f"points must have columns 'z', 'y', 'x', your dataframe had: {points_zyx.columns.tolist()}"
            points_zyx = points_zyx[[*'zyx']].values
        else:
            points_zyx = np.asarray(points_zyx)

        assert points_zyx.shape[1] == 3

        brick_shape = self.preferred_message_shape // (2**scale)
        idx = np.arange(len(points_zyx))[:, None]

        # columns: [bz, by, bx, z, y, x, i]
        brick_ids_and_points = np.concatenate(
            (points_zyx // brick_shape, points_zyx, idx), axis=1)
        brick_ids_and_points = lexsort_columns(brick_ids_and_points)

        # extract columns brick_ids, zyxi
        brick_ids = brick_ids_and_points[:, :3]
        sorted_points = brick_ids_and_points[:, 3:]

        # This is faster than pandas.DataFrame.groupby() for large data
        point_groups = [*groupby_presorted(sorted_points, brick_ids)]
        num_groups = len(point_groups)
        logger.info(
            f"Sampling labels for {len(points_zyx)} points from {num_groups} bricks"
        )

        def sample_labels_from_brick(points_zyxi):
            points_zyx = points_zyxi[:, :3]
            box = (points_zyx.min(axis=0), 1 + points_zyx.max(axis=0))
            vol = self.get_subvolume(box, scale)
            localpoints = points_zyx - box[0]
            labels = vol[(*localpoints.transpose(), )]
            df = pd.DataFrame(points_zyxi, columns=[*'zyxi'])
            df['label'] = labels
            return df

        import dask.bag as db

        point_groups = db.from_sequence(point_groups, npartitions=npartitions)
        label_dfs = point_groups.map(sample_labels_from_brick).compute()
        label_df = pd.concat(label_dfs, ignore_index=True)

        # Return in the same order the user passed in
        label_df = label_df.sort_values('i')
        return label_df["label"].values
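# Hypothetical usage sketch for sample_labels() above. `service` stands in for
# whatever volume-service object defines this method (it is not named in this
# snippet), and the coordinates are made up; shown only to illustrate the call.
import pandas as pd

points = pd.DataFrame({'z': [0, 32, 64],
                       'y': [10, 20, 30],
                       'x': [5, 15, 25]})

# labels = service.sample_labels(points, scale=0)
# labels[i] corresponds to points.iloc[i]: results come back in the caller's
# original order even though the points are regrouped by brick internally.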
    def label_indexes_for_body(self, body_group_span):
        """
        Load body_group (a subarray with dtype=STATS_DTYPE
        and only a single unique body_id) into a LabelIndex protobuf structure.
        """
        label_indexes = []
        
        body_group_start, body_group_stop = body_group_span
        body_group = self.block_sv_stats[body_group_start:body_group_stop]
        body_id = body_group[0]['body_id']

        if self.tombstone_mode != 'only':
            label_index = LabelIndex()
            label_index.label = body_id
            label_index.last_mutid = self.last_mutid
            label_index.last_mod_user = self.user
            label_index.last_mod_time = self.mod_time
            
            body_dtype = STATS_DTYPE[0]
            segment_dtype = STATS_DTYPE[1]
            coords_dtype = ('coord_cols', STATS_DTYPE[2:5])
            count_dtype = STATS_DTYPE[5]
            assert body_dtype[0] == 'body_id'
            assert segment_dtype[0] == 'segment_id'
            assert np.dtype(coords_dtype[1]).names == ('z', 'y', 'x')
            assert count_dtype[0] == 'count'
            
            body_group = body_group.view([body_dtype, segment_dtype, coords_dtype, count_dtype])
            coord_cols = body_group['coord_cols'].view((np.int32, 3)).reshape(-1, 3)
            for block_group in groupby_presorted(body_group, coord_cols):
                coord = block_group['coord_cols'][0]
                encoded_block_id = _encode_block_id(coord)
                label_index.blocks[encoded_block_id].counts.update( zip(block_group['segment_id'], block_group['count']) )
    
            label_indexes.append(label_index)
        
        if self.tombstone_mode in ('include', 'only'):
            # All segments in this body should no longer get a real index
            # (except for the segment that matches the body_id itself).
            # We'll send an empty LabelIndex (a 'tombstone') for each one.
            all_segments = np.unique(body_group['segment_id'])
            tombstone_segments = all_segments[all_segments != body_id]
            for segment_id in tombstone_segments:
                tombstone_index = LabelIndex()
                tombstone_index.label = segment_id
                tombstone_index.last_mutid = self.last_mutid
                tombstone_index.last_mod_user = self.user
                tombstone_index.last_mod_time = self.mod_time
                label_indexes.append(tombstone_index)

        return label_indexes
    def gen():
        next_stats_batch = []
        next_stats_batch_total_rows = 0

        for batch in groupby_presorted(block_sv_stats,
                                       block_sv_stats['body_id'][:, None]):
            next_stats_batch.append(batch)
            next_stats_batch_total_rows += len(batch)
            if next_stats_batch_total_rows >= batch_rows:
                yield (next_stats_batch, next_stats_batch_total_rows)
                next_stats_batch = []
                next_stats_batch_total_rows = 0

        # last batch
        if next_stats_batch:
            yield (next_stats_batch, next_stats_batch_total_rows)
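# Sketch of consuming gen() above (assumes numpy is imported as np, and that
# block_sv_stats and batch_rows exist in the enclosing scope, as in the code
# above). Each yielded item is (list_of_per_body_subarrays, total_row_count).
for stats_batch, num_rows in gen():
    stats = np.concatenate(stats_batch)   # stitch per-body groups into one array
    assert len(stats) == num_rows
    # ... process one batch of roughly batch_rows rows ...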
Example #7
def count_groups():
    num_groups = 0
    for _ in groupby_presorted(a, sorted_cols):
        num_groups += 1
    return num_groups
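# Equivalent one-liner (sketch): groupby_presorted() is consumed purely as an
# iterator here, so the group count can also be taken without an explicit loop.
num_groups = sum(1 for _ in groupby_presorted(a, sorted_cols))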
Example #8
def overwrite_sparsevol(server, uuid, instance, new_label, sparsevol_filepath,
                        roi_sbm, invert_roi, no_downres, logger):
    """
    Given a sparsevol (and an optional ROI mask), download all blocks
    intersecting the sparsevol, and overwrite the supervoxels in each
    block that are covered by the sparsevol (and ROI).
    
    Pseudo-code:
        
        1. Parse the sparsevol RLE data into a complete list of coordinates.
           (Note: For large bodies, this requires a lot of RAM.)
           
           Results in a large array:
           
               [[z,y,x],
                [z,y,x],
                ...
               ]

        2. Append columns to the coordinate array for the block index:
        
               [[z,y,x,bz,by,bx],
                [z,y,x,bz,by,bx],
                ...
               ]

        3. Sort the coordinate array by BLOCK index (bz,by,bx).
        
        4. Divide the coordinates into groups, by block index.
           Now each group of coordinates corresponds to a single
           block that needs to be patched.
           
        5. For each group:
             a. Construct a 3D mask from the coordinates in this group
                AND the intersection of the given ROI mask (if provided).
             b. Download the corresponding labelmap block.
             c. Overwrite the masked voxels with new_label.
             d. Do not post the patched block data immediately.
                Instead, save it to a temporary queue.
             e. If the queue has 400 blocks in it, post them all,
                and then clear the queue.

        6. After the above loop runs, clear the queue one last time.


    Args:
    
        server:
            dvid server, e.g. emdata1:8000

        uuid:
            uuid to read/write.  Must be unlocked.

        instance:
            labelmap instance to read/write blocks
        
        new_label:
            The supervoxel ID to use when overwriting the voxel data.

        sparsevol_filepath:
            path to a binary file containing a sparsevol,
            exactly as downloaded from DVID's /sparsevol endpoint.

        roi_sbm:
            Optional. An ROI mask, loaded into a SparseBlockMask object.
        
        invert_roi:
            If False, only overwrite voxels that overlap the given ROI.
            If True, only overwrite voxels that DON'T overlap the given ROI.
            (If you don't want the ROI to be used at all, set roi_sbm=None.)
        
        no_downres:
            If True, tell DVID not to update the labelmap downscale pyramids in
            response to each block write.  In that case, you're responsible
            for updating the downscale pyramids yourself.
        
        logger:
            A Python logger object for writing misc. status messages.
    
    Returns:
        The set of supervoxels that were at least partially overwritten by the new label.
    """
    sorted_path = sparsevol_filepath + '.sorted_blocktable.npy'
    if os.path.exists(sorted_path):
        logger.info("Loading presorted sparse coordinates")
        sorted_table = np.load(sorted_path)
    else:
        with open(sparsevol_filepath, 'rb') as f:
            with Timer("Parsing sparsevol coordinates", logger):
                coords = parse_rle_response(f.read(), np.int16)
            with Timer("Sorting sparsevol coordiantes", logger):
                table = np.concatenate((coords // 64, coords), axis=1)
                sorted_table = lexsort_columns(table)
                del table

            with Timer("Saving sparsevol sorted blocktable"):
                np.save(sorted_path, sorted_table)

    overwritten_labels = set()

    BLOCK_GROUP_SIZE = 400
    next_block_set = []
    for coord_group in groupby_presorted(sorted_table[:, 3:],
                                         sorted_table[:, :3]):
        block_corner = coord_group[0] // 64 * 64
        block_box = (block_corner, 64 + block_corner)
        block_voxels = fetch_labelarray_voxels(server,
                                               uuid,
                                               instance,
                                               block_box,
                                               supervoxels=True)

        block_mask = np.zeros_like(block_voxels, dtype=bool)
        mask_coords = coord_group - block_corner
        block_mask[tuple(mask_coords.transpose())] = True

        if roi_sbm is not None:
            roi_mask = roi_sbm.get_fullres_mask(block_box)
            if invert_roi:
                roi_mask = ~roi_mask
            block_mask[:] &= roi_mask

        if not block_mask.any():
            continue

        overwritten_labels |= set(pd.unique(block_voxels[block_mask]))
        block_voxels[block_mask] = new_label

        next_block_set.append((block_corner, block_voxels))

        # Flush the blocks periodically
        if len(next_block_set) == BLOCK_GROUP_SIZE:
            with Timer(f"Sending block set (N={len(next_block_set)})", logger):
                post_labelarray_blocks(server,
                                       uuid,
                                       instance,
                                       *zip(*next_block_set),
                                       downres=not no_downres)
            next_block_set = []

    with Timer(f"Sending last block set (N={len(next_block_set)})", logger):
        post_labelarray_blocks(server,
                               uuid,
                               instance,
                               *zip(*next_block_set),
                               downres=not no_downres)

    return overwritten_labels
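# Hypothetical invocation sketch for overwrite_sparsevol(). Every argument below
# is a placeholder (made-up server, uuid, instance, label, and path); only the
# call shape follows the Args documented above, and the call itself is left
# commented out because it requires a live DVID server.
import logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

# overwritten_svs = overwrite_sparsevol(
#     'emdata1:8000', 'abc123', 'segmentation',
#     new_label=999,
#     sparsevol_filepath='/path/to/body.sparsevol',
#     roi_sbm=None, invert_roi=False, no_downres=False, logger=log)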
Example #9
    def execute(self):
        self._sanitize_config()

        input_config = self.config["input"]
        options = self.config["samplepoints"]
        resource_config = self.config["resource-manager"]

        resource_mgr_client = ResourceManagerClient(resource_config["server"], resource_config["port"])
        volume_service = VolumeService.create_from_config(input_config, resource_mgr_client)

        input_csv = options["input-table"]
        with Timer(f"Reading {input_csv}", logger):
            coordinate_table_df = pd.read_csv(input_csv, header=0, dtype=CSV_TYPES)
            points = coordinate_table_df[['z', 'y', 'x']].values

        rescale = options["rescale-points-to-level"]
        if rescale != 0:
            points //= (2**rescale)

        # All points must lie within the input volume        
        points_box = [points.min(axis=0), 1+points.max(axis=0)]
        if (box_intersection(points_box, volume_service.bounding_box_zyx) != points_box).all():
            raise RuntimeError("The point list includes points outside of the volume bounding box.")

        with Timer("Sorting points by Brick ID", logger):
            # 'Brick ID' is defined as the brick's corner coordinate divided by the brick shape
            brick_shape = volume_service.preferred_message_shape
            brick_ids_and_points = np.concatenate( (points // brick_shape, points), axis=1 )
            brick_ids_and_points = lexsort_columns(brick_ids_and_points)

            brick_ids = brick_ids_and_points[:, :3]
            points = brick_ids_and_points[:, 3:]
            
            # Extract the first row of each group to get the set of unique brick IDs
            point_group_spans = groupby_spans_presorted(brick_ids)
            point_group_starts = (start for start, stop in point_group_spans)
            unique_brick_ids = brick_ids[np.fromiter(point_group_starts, np.int32)]

        with Timer("Constructing sparse mask", logger):
            # BrickWall.from_volume_service() supports the ability to initialize a sparse RDD,
            # with only a subset of Bricks (rather than a dense RDD containing every brick
            # within the volume bounding box).
            # It requires a SparseBlockMask object indicating exactly which Bricks need to be fetched.
            brick_mask_box = np.array([unique_brick_ids.min(axis=0), 1+unique_brick_ids.max(axis=0)])

            brick_mask_shape = (brick_mask_box[1] - brick_mask_box[0])
            brick_mask = np.zeros(brick_mask_shape, bool)
            brick_mask_coords = unique_brick_ids - brick_mask_box[0]
            brick_mask[tuple(brick_mask_coords.transpose())] = True
            sbm = SparseBlockMask(brick_mask, brick_mask_box*brick_shape, brick_shape)

        with Timer("Initializing BrickWall", logger):
            # Aim for 2 GB RDD partitions when loading segmentation
            GB = 2**30
            target_partition_size_voxels = 2 * GB // np.uint64().nbytes
            brickwall = BrickWall.from_volume_service(volume_service, 0, None, self.client, target_partition_size_voxels, 0, sbm, lazy=True)
        
        with Timer(f"Grouping {len(points)} points", logger):
            # This is faster than pandas.DataFrame.groupby() for large data
            point_groups = groupby_presorted(points, brick_ids)
            id_and_ptgroups = list(zip(unique_brick_ids, point_groups))
            num_groups = len(id_and_ptgroups)

        with Timer(f"Join {num_groups} point groups with bricks", logger):
            id_and_ptgroups = dask.bag.from_sequence( id_and_ptgroups,
                                                      npartitions=brickwall.bricks.npartitions )

            id_and_ptgroups = id_and_ptgroups.map(lambda i_p: (*i_p[0], i_p[1]))
            id_and_ptgroups_df = id_and_ptgroups.to_dataframe(columns=['z', 'y', 'x', 'pointgroup'])
            
            ids_and_bricks = brickwall.bricks.map(lambda brick: (*(brick.logical_box[0] // brick_shape), brick))
            ids_and_bricks_df = ids_and_bricks.to_dataframe(columns=['z', 'y', 'x', 'brick'])

            def set_brick_id_index(df):
                def set_brick_id(df):
                    df['brick_id'] = encode_coords_to_uint64( df[['z', 'y', 'x']].values.astype(np.int32) )
                    return df
                df['brick_id'] = np.uint64(0)
                df = df.map_partitions(set_brick_id, meta=df)

                # Note: bricks and pointgroups are already sorted by
                # brick scan-order, so brick_id is already sorted.
                # Specifying sorted=True is critical to performance here.
                df = df.set_index('brick_id', sorted=True)
                return df

            # Give them matching indexes
            ids_and_bricks_df = set_brick_id_index(ids_and_bricks_df)
            id_and_ptgroups_df = set_brick_id_index(id_and_ptgroups_df)

            # Join (index-on-index, so it should be fast)
            ptgroup_and_brick_df = id_and_ptgroups_df.merge( ids_and_bricks_df,
                                                             how='left', left_index=True, right_index=True )
            ptgroup_and_brick_df = ptgroup_and_brick_df[['pointgroup', 'brick']]
            ptgroup_and_brick = ptgroup_and_brick_df.to_bag()
            
        # Persist and force computation before proceeding.
        #ptgroup_and_brick = persist_and_execute(ptgroup_and_brick, "Persisting joined point groups", logger, False)
        #assert ptgroup_and_brick.count().compute() == num_groups == brickwall.num_bricks

        def sample_points(points_and_brick):
            """
            Given a Brick and array of points (N,3) that lie within it,
            sample labels from the points within the brick and return
            a record array containing the points and the sampled labels.
            """
            points, brick = points_and_brick

            result_dtype = [('z', np.int32), ('y', np.int32), ('x', np.int32), ('label', np.uint64)]
            result = np.zeros((len(points),), result_dtype)
            result['z'] = points[:,0]
            result['y'] = points[:,1]
            result['x'] = points[:,2]

            # Make relative to brick offset
            points -= brick.physical_box[0]
            
            result['label'] = brick.volume[tuple(points.transpose())]
            return result

        with Timer("Sampling bricks", logger):
            brick_samples = ptgroup_and_brick.map(sample_points).compute()

        with Timer("Concatenating samples", logger):
            sample_table = np.concatenate(brick_samples)

        with Timer("Sorting samples", logger):
            # This will sort in terms of the SCALED z,y,x coordinates
            sample_table.sort()

        with Timer("Sorting table", logger):
            if rescale == 0:
                coordinate_table_df.sort_values(['z', 'y', 'x'], inplace=True)
            else:
                # sample_table is sorted by RESCALED coordinate,
                # so sort our table the same way
                coordinate_table_df['rz'] = coordinate_table_df['z'] // (2**rescale)
                coordinate_table_df['ry'] = coordinate_table_df['y'] // (2**rescale)
                coordinate_table_df['rx'] = coordinate_table_df['x'] // (2**rescale)
                coordinate_table_df.sort_values(['rz', 'ry', 'rx'], inplace=True)
                del coordinate_table_df['rz']
                del coordinate_table_df['ry']
                del coordinate_table_df['rx']
                
        # Now that samples and input rows are sorted identically,
        # append the results
        output_col = options["output-column"]
        coordinate_table_df[output_col] = sample_table['label'].copy()

        if rescale != 0:
            with Timer("Re-sorting table at scale 0", logger):
                # For simplicity (API and testing), we guarantee that coordinates are sorted in the output.
                # In the case of rescaled points, they need to be sorted once more (at scale 0 this time)
                coordinate_table_df.sort_values(['z', 'y', 'x'], inplace=True)

        with Timer("Exporting samples", logger):
            coordinate_table_df.to_csv(options["output-table"], header=True, index=False)

        logger.info("DONE.")
    def execute(self):
        self._sanitize_config()
        config = self.config_data
        options = config["options"]

        resource_mgr_client = ResourceManagerClient(options["resource-server"], options["resource-port"])
        volume_service = VolumeService.create_from_config(config["input"], self.config_dir, resource_mgr_client)

        input_csv = config["options"]["input-table"]
        with Timer(f"Reading {input_csv}", logger):
            coordinate_table_df = pd.read_csv(input_csv, header=0, dtype=CSV_TYPES)
            points = coordinate_table_df[['z', 'y', 'x']].values

        rescale = config["options"]["rescale-points-to-level"]
        if rescale != 0:
            points //= 2**rescale

        # All points must lie within the input volume        
        points_box = [points.min(axis=0), 1+points.max(axis=0)]
        if (box_intersection(points_box, volume_service.bounding_box_zyx) != points_box).all():
            raise RuntimeError("The point list includes points outside of the volume bounding box.")

        with Timer("Sorting points by Brick ID", logger):
            # 'Brick ID' is defined as the brick's corner coordinate divided by the brick shape
            brick_shape = volume_service.preferred_message_shape
            brick_ids_and_points = np.concatenate( (points // brick_shape, points), axis=1 )
            brick_ids_and_points = lexsort_columns(brick_ids_and_points)

            brick_ids = brick_ids_and_points[:, :3]
            points = brick_ids_and_points[:, 3:]
            
            # Extract the first row of each group to get the set of unique brick IDs
            point_group_spans = groupby_spans_presorted(brick_ids)
            point_group_starts = (start for start, stop in point_group_spans)
            unique_brick_ids = brick_ids[np.fromiter(point_group_starts, np.int32)]

        with Timer("Distributing points", logger):
            # This is faster than pandas.DataFrame.groupby() for large data
            point_groups = groupby_presorted(points, brick_ids)
            id_and_ptgroup = self.sc.parallelize(zip(map(tuple, unique_brick_ids), point_groups))
        
        with Timer("Constructing sparse mask", logger):
            # BrickWall.from_volume_service() supports the ability to initialize a sparse RDD,
            # with only a subset of Bricks (rather than a dense RDD containing every brick
            # within the volume bounding box).
            # It requires a SparseBlockMask object indicating exactly which Bricks need to be fetched.
            brick_mask_box = np.array([unique_brick_ids.min(axis=0), 1+unique_brick_ids.max(axis=0)])

            brick_mask_shape = (brick_mask_box[1] - brick_mask_box[0])
            brick_mask = np.zeros(brick_mask_shape, bool)
            brick_mask_coords = unique_brick_ids - brick_mask_box[0]
            brick_mask[tuple(brick_mask_coords.transpose())] = True
            sbm = SparseBlockMask(brick_mask, brick_mask_box*brick_shape, brick_shape)

        with Timer("Initializing BrickWall", logger):
            # Aim for 2 GB RDD partitions when loading segmentation
            GB = 2**30
            target_partition_size_voxels = 2 * GB // np.uint64().nbytes
            brickwall = BrickWall.from_volume_service(volume_service, 0, None, self.sc, target_partition_size_voxels, sbm, lazy=True)
        
        with Timer("Joining point groups with bricks", logger):
            id_and_brick = brickwall.bricks.map(lambda brick: (tuple(brick.logical_box[0] // brick_shape), brick))
            brick_and_ptgroup = id_and_brick.join(id_and_ptgroup).values() # discard id

        def sample_points(brick_and_points):
            """
            Given a Brick and array of points (N,3) that lie within it,
            sample labels from the points within the brick and return
            a record array containing the points and the sampled labels.
            """
            brick, points = brick_and_points

            result_dtype = [('z', np.int32), ('y', np.int32), ('x', np.int32), ('label', np.uint64)]
            result = np.zeros((len(points),), result_dtype)
            result['z'] = points[:,0]
            result['y'] = points[:,1]
            result['x'] = points[:,2]

            # Make relative to brick offset
            points -= brick.physical_box[0]
            
            result['label'] = brick.volume[tuple(points.transpose())]
            return result

        with Timer("Sampling bricks", logger):
            brick_samples = brick_and_ptgroup.map(sample_points).collect()

        with Timer("Concatenating samples", logger):
            sample_table = np.concatenate(brick_samples)

        with Timer("Sorting samples", logger):
            sample_table.sort()

        with Timer("Sorting table", logger):
            if rescale == 0:
                coordinate_table_df.sort_values(['z', 'y', 'x'], inplace=True)
            else:
                # sample_table is sorted by RESCALED coordinate,
                # so sort our table the same way
                coordinate_table_df['rz'] = coordinate_table_df['z'] // (2**rescale)
                coordinate_table_df['ry'] = coordinate_table_df['y'] // (2**rescale)
                coordinate_table_df['rx'] = coordinate_table_df['x'] // (2**rescale)
                coordinate_table_df.sort_values(['rz', 'ry', 'rx'], inplace=True)
                del coordinate_table_df['rz']
                del coordinate_table_df['ry']
                del coordinate_table_df['rx']

        # Now that samples and input rows are sorted identically,
        # append the results
        output_col = options["output-column"]
        coordinate_table_df[output_col] = sample_table['label']

        with Timer("Exporting samples", logger):
            coordinate_table_df.to_csv(config["options"]["output-table"], header=True, index=False)

        logger.info("DONE.")