def test_iter_batches():
    data = range(10)

    # If data supports len, we support it, and if not, we don't.
    assert hasattr(iter_batches(data, 3), '__len__')
    assert not hasattr(iter_batches(iter(data), 3), '__len__')

    assert len(iter_batches(data, 3)) == 4

    assert [*iter_batches(data, 3)] == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
    assert [*iter_batches(iter(data), 3)] == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

    data = list(data)
    assert [*iter_batches(data, 3)] == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

    data = np.array(data)
    assert [a.tolist() for a in iter_batches(data, 3)] == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

    data = pd.Series(data)
    assert [a.tolist() for a in iter_batches(data, 3)] == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

    data = pd.DataFrame(data, columns=['a'])
    assert [df['a'].tolist() for df in iter_batches(data, 3)] == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
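
# Hypothetical sketch of an iter_batches() implementation, written only to make
# the contract in test_iter_batches() above concrete -- this is an assumption,
# not the library's actual code. Sized inputs (range/list/ndarray/Series/DataFrame)
# get an object that supports len(); plain iterators get a generator that doesn't.
import itertools

def iter_batches_sketch(data, batch_size):
    try:
        len(data)
    except TypeError:
        return _batches_from_iterator(data, batch_size)
    return _SizedBatches(data, batch_size)

def _batches_from_iterator(it, batch_size):
    # Plain generator: no __len__, matching the iter(data) case in the test.
    while True:
        batch = list(itertools.islice(it, batch_size))
        if not batch:
            return
        yield batch

class _SizedBatches:
    def __init__(self, data, batch_size):
        self.data = data
        self.batch_size = batch_size

    def __len__(self):
        # Number of batches, rounded up: 10 items with batch_size 3 -> 4
        return -(-len(self.data) // self.batch_size)

    def __iter__(self):
        for start in range(0, len(self.data), self.batch_size):
            s = slice(start, start + self.batch_size)
            if isinstance(self.data, (pd.Series, pd.DataFrame)):
                yield self.data.iloc[s]   # positional slicing for pandas
            elif isinstance(self.data, range):
                yield list(self.data[s])  # the test expects plain lists here
            else:
                yield self.data[s]        # list/ndarray slicing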
def append_synapse_columns(self, body_table, neuprint_info):
    server, dataset = neuprint_info["server"], neuprint_info["dataset"]
    if not server or not dataset:
        return body_table

    from neuprint import Client, default_client as neuprint_default_client, fetch_neurons, NeuronCriteria as NC

    @auto_retry(5, pause_between_tries=3.0, logging_name=__name__)
    def fetch_synapse_counts(bodies):
        try:
            c = neuprint_default_client()
        except Exception:
            c = Client(server, dataset)
        ndf, cdf = fetch_neurons(NC(bodyId=bodies, label='Segment'), client=c)
        return ndf.set_index('bodyId')[['pre', 'post']].rename_axis('body')

    bag = db.from_sequence(iter_batches(body_table.index.values, 1000), npartitions=16)
    sc_dfs = bag.map(fetch_synapse_counts).compute()
    sc_df = pd.concat(sc_dfs)

    body_table = body_table.merge(sc_df, 'left', on='body')
    body_table['pre'] = body_table['pre'].fillna(0.0).astype(int)
    body_table['post'] = body_table['post'].fillna(0.0).astype(int)
    return body_table
def edges_to_assignments(df, gray_source, seg_source, sv_as_body=False, batch_size=100,
                         output_path=None, *, shuffle=False, description=""):
    if isinstance(df, str):
        df = pd.read_csv(df)
    assert isinstance(df, pd.DataFrame)

    dupes = df.duplicated(['sv_a', 'sv_b']).sum()
    if dupes:
        print(f"Dropping {dupes} duplicate tasks!")
        df = df.drop_duplicates(['sv_a', 'sv_b'])

    print(f"Writing {len(df)} tasks")
    if shuffle:
        print("Shuffling task order")
        df = df.sample(frac=1)

    # Only create the output directory if a path was given
    # (and if the path actually contains a directory component).
    if output_path and os.path.dirname(output_path):
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

    assignments = []
    for i, batch_df in enumerate(iter_batches(df, batch_size)):
        if output_path:
            base, _ = os.path.splitext(output_path)
            batch_path = f"{base}-{i:03d}.json"
        else:
            batch_path = None
        a = edges_to_assignment(batch_df, gray_source, seg_source, sv_as_body, batch_path,
                                description=description)
        assignments.append(a)
    return assignments
def _execute_scale(self, scale, starting_batch, mask_s5, mask_box_s5):
    options = self.config["masksegmentation"]
    block_width = self.output_service.block_width

    def scale_box(box, scale):
        # Scale down, then round up to the nearest multiple of the block width
        box = np.ceil(box / 2**scale).astype(np.int32)
        return round_box(box, block_width)

    # Bounding box of the segmentation at the current scale.
    bounding_box = scale_box(self.input_service.bounding_box_zyx, scale)

    # Don't make bricks that are wider than the bounding box at this scale
    brick_shape = np.minimum(self.input_service.preferred_message_shape, bounding_box[1])
    assert not (brick_shape % block_width).any()

    brick_boxes = boxes_from_grid(bounding_box, brick_shape, clipped=True)

    with Timer(f"Scale {scale}: Preparing bricks", logger):
        boxes_and_masks = []
        for box in brick_boxes:
            mask_block_box = (box // 2**(5 - scale)) - mask_box_s5[0]
            mask_block_box = mask_block_box.astype(np.int32)  # necessary when scale is > 5
            mask_block_s5 = extract_subvol(mask_s5, mask_block_box)
            if mask_block_s5.any():
                boxes_and_masks.append((box, mask_block_s5))

    batches = [*iter_batches(boxes_and_masks, options["batch-size"])]

    if starting_batch == 0:
        logger.info(f"Scale {scale}: Processing {len(batches)} batches")
    else:
        logger.info(f"Scale {scale}: Processing {len(batches) - starting_batch} "
                    f"remaining batches from {len(batches)} original batches")
        assert starting_batch < len(batches), \
            f"Can't start at batch {starting_batch}; there are only {len(batches)} in total."
        batches = batches[starting_batch:]

    for batch_index, batch_boxes_and_masks in enumerate(batches, start=starting_batch):
        with Timer(f"Scale {scale}: Batch {batch_index:02d}", logger):
            self._execute_batch(scale, batch_index, batch_boxes_and_masks)
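
# Worked example of the scale_box() logic above, self-contained for clarity.
# round_box() is emulated here with plain numpy (assuming, per the comment in
# scale_box(), that it rounds the box outward to block-width multiples);
# block_width=64 is an arbitrary illustrative value.
import numpy as np

def demo_scale_box(box, scale, block_width=64):
    box = np.ceil(np.asarray(box) / 2**scale).astype(np.int32)
    box[0] = (box[0] // block_width) * block_width    # round start down
    box[1] = -(-box[1] // block_width) * block_width  # round stop up
    return box

# demo_scale_box([[0, 0, 0], [1000, 900, 500]], scale=3)
# ceil-scales to [[0, 0, 0], [125, 113, 63]], then rounds out to
# [[0, 0, 0], [128, 128, 64]]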
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--split-into-batches', type=int,
                        help='If given, also split the body stats into this many batches of roughly equal size')
    parser.add_argument('server')
    parser.add_argument('src_uuid')
    parser.add_argument('labelmap_instance')
    parser.add_argument('supervoxel_block_stats_h5',
                        help=f'An HDF5 file with a single dataset "stats", with dtype: {STATS_DTYPE[1:]} '
                             '(Note: No column for body_id)')
    args = parser.parse_args()

    configure_default_logging()
    initialize_excepthook()

    (block_sv_stats, _presorted_by, _agglo_path) = load_stats_h5_to_records(args.supervoxel_block_stats_h5)

    src_info = (args.server, args.src_uuid, args.labelmap_instance)
    mapping = fetch_mappings(*src_info)
    assert isinstance(mapping, pd.Series)

    mapping_df = mapping.reset_index().rename(columns={'sv': 'segment_id', 'body': 'body_id'})

    # sorts in-place, and saves a copy to hdf5
    sort_block_stats(block_sv_stats, mapping_df,
                     args.supervoxel_block_stats_h5[:-3] + '-sorted-by-body.h5',
                     '<fetched-from-dvid>')

    if args.split_into_batches:
        num_batches = args.split_into_batches
        batch_size = int(np.ceil(len(block_sv_stats) / args.split_into_batches))
        logger.info(f"Splitting into {args.split_into_batches} batches of size ~{batch_size}")
        os.makedirs('stats-batches', exist_ok=True)

        body_spans = groupby_spans_presorted(block_sv_stats['body_id'][:, None])
        for batch_index, batch_spans in enumerate(tqdm_proxy(iter_batches(body_spans, batch_size))):
            span_start, span_stop = batch_spans[0][0], batch_spans[-1][1]
            batch_stats = block_sv_stats[span_start:span_stop]
            digits = int(np.ceil(np.log10(num_batches)))
            batch_path = ('stats-batches/stats-batch-{:0' + str(digits) + 'd}.h5').format(batch_index)
            save_stats(batch_stats, batch_path)

    logger.info("DONE sorting stats by body")
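
# Hypothetical sketch of groupby_spans_presorted(), inferred from its usage
# above (batch_spans[0][0] / batch_spans[-1][1] bound a contiguous row range)
# -- an assumption, not the actual implementation. Given a presorted 1-D
# sequence of group ids, it yields one (start, stop) index span per run of
# equal values.
def groupby_spans_presorted_sketch(ids):
    start = 0
    for i in range(1, len(ids) + 1):
        if i == len(ids) or ids[i] != ids[start]:
            yield (start, i)
            start = i

# list(groupby_spans_presorted_sketch([1, 1, 2, 3, 3, 3]))
# -> [(0, 2), (2, 3), (3, 6)]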
def execute(self): options = self.config["maskedcopy"] input_service, mask_service, output_service = self.init_services() def _masked_copy(box): seg_vol = input_service.get_subvolume(box) mask_vol = mask_service.get_subvolume(box).astype(bool) seg_vol[~mask_vol] = 0 output_service.write_subvolume(seg_vol, box[0]) return (*box[0], mask_vol.sum()) # Boxes are determined by the left volume/labels/roi boxes = self.init_boxes(input_service, options["roi"]) batches = iter_batches(boxes, options["batch-size"]) logger.info(f"Performing masked copy of {len(boxes)} bricks in total.") logger.info( f"Processing {len(batches)} batches of {options['batch-size']} bricks each." ) os.makedirs('mask-stats', exist_ok=True) for batch_index, batch_boxes in enumerate(batches): if batch_index < options["restart-at-batch"]: logger.info(f"Batch {batch_index}: Skipping") continue with Timer(f"Batch {batch_index}: Copying", logger): # Aim for 4 partitions per worker total_cores = sum(self.client.ncores().values()) brick_counts = (db.from_sequence( batch_boxes, npartitions=4 * total_cores).map(_masked_copy).compute()) brick_counts_df = pd.DataFrame(brick_counts, columns=[*'zyx', 'mask_voxels']) brick_counts_df.to_csv( f'mask-stats/batch-{batch_index:03d}-brick-mask-voxels.csv', header=True, index=False)
def main():
    RESULTS_PKL_PATH = sys.argv[1]
    if len(sys.argv) == 3:
        PROCESSES = int(sys.argv[2])
    else:
        PROCESSES = 4

    # Calculate the difference in resolution between the stored mito segmentation
    # and neuron segmentation. If they differ, it must be by a power of 2.
    mito_res = fetch_info(*MITO_SEG)["Extended"]["VoxelSize"][0]
    assert mito_res % NEIGHBORHOOD_RES == 0
    assert np.log2(mito_res / NEIGHBORHOOD_RES) == int(np.log2(mito_res / NEIGHBORHOOD_RES)), \
        "This script assumes that the mito resolution and neighborhood resolution differ by a power of 2."
    mito_res_scale_diff = int(np.log2(mito_res // NEIGHBORHOOD_RES))

    with open(RESULTS_PKL_PATH, 'rb') as f:
        mc_df = pickle.load(f)

    new_names = {col: col.replace(' ', '_') for col in mc_df.columns}
    new_names['result'] = 'proofreader_count'
    mc_df = mc_df.rename(columns=new_names)

    print("Evaluating mito count results")
    results = compute_parallel(partial(_task_results, mito_res_scale_diff),
                               iter_batches(mc_df.drop_duplicates('neighborhood_id'), 1),
                               total=len(mc_df), processes=PROCESSES,
                               leave_progress=True, ordered=False)

    cols = ['neighborhood_id', 'neighborhood_origin', 'proofreader_count',
            'mito_id_count', 'mito_ids', 'mito_sizes',
            'num_ccs', 'mito_cc_ids', 'mito_cc_sizes', 'ng_link']
    df = pd.DataFrame(results, columns=cols)

    # Add columns for cell type (from neuprint)
    print("Fetching neuron cell types")
    origins_df = pd.DataFrame(df['neighborhood_origin'].tolist(), columns=[*'xyz'])
    df['body'] = fetch_labels_batched(*NEURON_SEG, origins_df[[*'zyx']].values, processes=8)
    neurons_df, _ = fetch_neurons(df['body'].unique())
    neurons_df = neurons_df.rename(columns={'bodyId': 'body',
                                            'type': 'body_type',
                                            'instance': 'body_instance'})
    df = df.merge(neurons_df[['body', 'body_type', 'body_instance']], 'left', on='body')
    df['body_type'] = df['body_type'].fillna("")
    df['body_instance'] = df['body_instance'].fillna("")

    # Append roi column
    print("Determining ROIs")
    determine_point_rois(*NEURON_SEG[:2], NEUPRINT_CLIENT.primary_rois, origins_df)
    df['roi'] = origins_df['roi']

    # Results only
    path = 'mito-seg-counts.pkl'
    print(f"Writing {path}")
    with open(path, 'wb') as f:
        pickle.dump(df, f)

    path = 'mito-seg-counts.tab-delimited.csv'
    print(f"Writing {path}")
    df.to_csv(path, sep='\t', header=True, index=False)

    # Full results (with task info columns)
    df = df.merge(mc_df.drop(columns=['neighborhood_origin', 'proofreader_count']),
                  'left', on='neighborhood_id')

    path = 'full-results-with-mito-seg-counts.pkl'
    print(f"Writing {path}")
    with open(path, 'wb') as f:
        pickle.dump(df, f)

    path = 'full-results-with-mito-seg-counts.tab-delimited.csv'
    print(f"Writing {path}")
    df.to_csv(path, sep='\t', header=True, index=False)

    print("DONE")
def execute(self):
    self.init_services()
    left_service = self.left_service
    right_service = self.right_service
    options = self.config["contingencytable"]

    left_is_supervoxels = False
    if isinstance(left_service.base_service, DvidVolumeService):
        left_is_supervoxels = left_service.base_service.supervoxels

    left_roi = options["left-roi"]
    left_subset_labels = load_body_list(options["left-subset-labels"], left_is_supervoxels)
    left_subset_labels = set(left_subset_labels)
    sparse_fetch = not options["skip-sparse-fetch"]

    # Boxes are determined by the left volume/labels/roi
    boxes = self.init_boxes(left_service, sparse_fetch and left_subset_labels, left_roi)

    def _contingency_table(box):
        left_vol = left_service.get_subvolume(box)
        right_vol = right_service.get_subvolume(box)
        table = contingency_table(left_vol, right_vol)
        return table.reset_index()

    batch_tables = []
    batches = iter_batches(boxes, options["batch-size"])
    logger.info(f"Computing contingency tables for {len(boxes)} bricks in total.")
    logger.info(f"Processing {len(batches)} batches of {options['batch-size']} bricks each.")

    for batch_index, batch_boxes in enumerate(batches):
        with Timer(f"Batch {batch_index}: Computing tables", logger):
            # Aim for 4 partitions per worker
            total_cores = sum(self.client.ncores().values())
            tables = (db.from_sequence(batch_boxes, npartitions=4 * total_cores)
                        .map(_contingency_table)
                        .compute())

        table = pd.concat(tables, ignore_index=True).sort_values(['left', 'right']).reset_index(drop=True)
        table = table.groupby(['left', 'right'], as_index=False, sort=False)['voxel_count'].sum()
        batch_tables.append(table)

    with Timer("Constructing final table", logger):
        final_table = pd.concat(batch_tables, ignore_index=True).sort_values(['left', 'right']).reset_index(drop=True)
        final_table = final_table.groupby(['left', 'right'], as_index=False, sort=False)['voxel_count'].sum()

    with Timer("Writing contingency_table.npy", logger):
        np.save('contingency_table.npy', final_table.to_records(index=False))
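
# contingency_table() itself isn't shown here. Below is a minimal sketch of
# what it appears to compute, judging from its usage above (an assumption,
# not the library's implementation): the voxel overlap count for every
# (left, right) label pair, indexed by ['left', 'right'] with a
# 'voxel_count' value, so .reset_index() yields the three columns used above.
import pandas as pd

def contingency_table_sketch(left_vol, right_vol):
    assert left_vol.shape == right_vol.shape
    df = pd.DataFrame({'left': left_vol.reshape(-1),
                       'right': right_vol.reshape(-1)})
    # size() counts how many voxels each (left, right) label pair shares.
    return df.groupby(['left', 'right']).size().rename('voxel_count')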
def execute(self): options = self.config["mitodistances"] output_dir = self.config["output-directory"] body_svc, mito_svc = self.init_services() # Resource manager context must be initialized before resource manager client # (to overwrite config values as needed) dvid_mgr_config = self.config["dvid-access-manager"] dvid_mgr_context = LocalResourceManager(dvid_mgr_config) dvid_mgr_client = ResourceManagerClient(dvid_mgr_config["server"], dvid_mgr_config["port"]) syn_server, syn_uuid, syn_instance = (options['synapse-criteria'][k] for k in ('server', 'uuid', 'instance')) syn_conf = float(options['synapse-criteria']['confidence']) syn_types = ['PreSyn', 'PostSyn'] if options['synapse-criteria']['type'] == 'pre': syn_types = ['PreSyn'] elif options['synapse-criteria']['type'] == 'post': syn_types = ['PostSyn'] bodies = load_body_list(options["bodies"], False) skip_flags = [ os.path.exists(f'{output_dir}/{body}.csv') for body in bodies ] bodies_df = pd.DataFrame({'body': bodies, 'should_skip': skip_flags}) bodies = bodies_df.query('not should_skip')['body'] # Shuffle for better load balance? # TODO: Would be better to sort by synapse count, and put large bodies first, # assigned to partitions in round-robin style. # Then work stealing will be more effective at knocking out the smaller jobs at the end. # This requires knowing all the body sizes, though. # Perhaps mito count would be a decent proxy for synapse count, and it's readily available. #bodies = bodies.sample(frac=1.0).values os.makedirs('body-logs') os.makedirs(output_dir, exist_ok=True) mito_server, mito_uuid, mito_instance = (options['mito-labelmap'][k] for k in ('server', 'uuid', 'instance')) @auto_retry(3) def _fetch_synapses(body): with dvid_mgr_client.access_context(syn_server, True, 1, 1): syn_df = fetch_annotation_label(syn_server, syn_uuid, syn_instance, body, format='pandas') if len(syn_df) == 0: return syn_df syn_types, syn_conf syn_df = syn_df.query( 'kind in @syn_types and conf >= @syn_conf').copy() return syn_df[[*'xyz', 'kind', 'conf' ]].sort_values([*'xyz']).reset_index(drop=True) @auto_retry(3) def _fetch_mito_ids(body): with dvid_mgr_client.access_context(mito_server, True, 1, 1): try: return fetch_supervoxels(mito_server, mito_uuid, mito_instance, body) except HTTPError: return [] def process_and_save(body): tbars = _fetch_synapses(body) valid_mitos = _fetch_mito_ids(body) # TODO: # Does the stdout_redirected() mechanism work correctly in the context of multiprocessing? # If not, I should probably just use a custom logging handler instead. 
with open(f"body-logs/{body}.log", "w") as f, stdout_redirected(f), Timer() as timer: processed_tbars = [] if len(tbars) == 0: logging.getLogger(__name__).warning( f"Body {body}: No synapses found") if len(valid_mitos) == 0: logging.getLogger(__name__).warning( f"Body {body}: Failed to fetch mito supervoxels") processed_tbars = initialize_results(body, tbars) if len(valid_mitos) and len(tbars): processed_tbars = measure_tbar_mito_distances( body_svc, mito_svc, body, tbars=tbars, valid_mitos=valid_mitos) if len(processed_tbars) > 0: processed_tbars.to_csv(f'{output_dir}/{body}.csv', header=True, index=False) with open(f'{output_dir}/{body}.pkl', 'wb') as f: pickle.dump(processed_tbars, f) if len(tbars) == 0: return (body, 0, 'no-synapses', timer.seconds) if len(valid_mitos) == 0: return (body, len(processed_tbars), 'no-mitos', timer.seconds) return (body, len(tbars), 'success', timer.seconds) logger.info( f"Processing {len(bodies)}, skipping {bodies_df['should_skip'].sum()}" ) def process_batch(bodies): return [*map(process_and_save, bodies)] with dvid_mgr_context: batch_size = max(1, len(bodies) // 10_000) futures = self.client.map(process_batch, iter_batches(bodies, batch_size)) # Support synchronous testing with a fake 'as_completed' object if hasattr(self.client, 'DEBUG'): ac = as_completed_synchronous(futures, with_results=True) else: ac = distributed.as_completed(futures, with_results=True) try: results = [] for f, r in tqdm_proxy(ac, total=len(futures)): results.extend(r) finally: results = pd.DataFrame( results, columns=['body', 'synapses', 'status', 'processing_time']) results.to_csv('results-summary.csv', header=True, index=False) num_errors = len(results.query('status == "error"')) if num_errors: logger.warning( f"Encountered {num_errors} errors. See results-summary.csv" )
def execute(self):
    scale = self._init_service()
    options = self.config["roistats"]
    server = self.input_service.base_service.server
    uuid = self.input_service.base_service.uuid
    rois = options["rois"]

    bodies = load_body_list(options["subset-bodies"], self.input_service.base_service.supervoxels)
    assert len(bodies) > 0, "Please provide a list of subset-bodies to process"

    bounding_box = self.input_service.bounding_box_zyx
    assert not (bounding_box % 2**(5 - scale)).any(), \
        "Make sure your configured bounding box is divisible by 32px at scale 0"

    brick_shape = self.input_service.preferred_message_shape
    assert not (brick_shape % 2**(5 - scale)).any(), \
        "Make sure your preferred message shape divides into 32px blocks at scale 0"

    with Timer("Fetching ROI volume", logger):
        roi_vol_s5, roi_box_s5, overlaps = fetch_combined_roi_volume(
            server, uuid, rois, False, bounding_box // 2**(5 - scale))

    if len(overlaps) > 0:
        logger.warning(f"Some of your ROIs overlap! Here's an incomplete list:\n{overlaps}")

    with Timer("Determining brick set", logger):
        brick_coords_df = self.input_service.sparse_brick_coords_for_labels(bodies)
        np.save('brick-coords.npy', brick_coords_df.to_records(index=False))

    with Timer("Preparing bricks", logger):
        boxes_and_roi_bricks = []
        for coord, labels in brick_coords_df.groupby([*'zyx'])['label'].agg(tuple).items():
            box = np.array((coord, coord))
            box[1] += brick_shape
            box = box_intersection(box, bounding_box)

            roi_brick_box = (box // 2**(5 - scale)) - roi_box_s5[0]
            roi_brick_s5 = extract_subvol(roi_vol_s5, roi_brick_box)
            boxes_and_roi_bricks.append((box, roi_brick_s5, labels))

    logger.info(f"Prepared {len(boxes_and_roi_bricks)} bricks of shape {(*brick_shape[::-1],)}")

    all_stats = []
    batches = [*iter_batches(boxes_and_roi_bricks, options["batch-size"])]
    logger.info(f"Processing {len(batches)} batches")
    for i, batch_boxes_and_bricks in enumerate(batches):
        with Timer(f"Batch {i:02d}", logger):
            batch_stats = self._execute_batch(scale, batch_boxes_and_bricks)
            all_stats.append(batch_stats)

    all_stats = pd.concat(all_stats, ignore_index=True)
    all_stats = all_stats.groupby(['body', 'roi_id'], as_index=False)['voxels'].sum()

    roi_names = pd.Series(["<none>", *rois], name='roi')
    roi_names.index.name = 'roi_id'
    all_stats = all_stats.merge(roi_names, 'left', on='roi_id')
    all_stats = all_stats.sort_values(['body', 'roi_id'])

    if scale > 0:
        all_stats.rename(columns={'voxels': f'voxels_s{scale}'}, inplace=True)

    with Timer(f"Writing stats ({len(all_stats)} rows)", logger):
        np.save('roi-stats.npy', all_stats.to_records(index=False))
        all_stats.to_csv('roi-stats.csv', index=False, header=True)
def execute(self):
    self.init_services()
    primary_service = self.primary_service
    contingency_service = self.contingency_service
    output_service = self.output_service
    options = self.config["contingentrelabel"]

    primary_is_supervoxels = False
    if isinstance(primary_service.base_service, DvidVolumeService):
        primary_is_supervoxels = primary_service.base_service.supervoxels

    roi = options["roi"]
    subset_labels = load_body_list(options["subset-labels"], primary_is_supervoxels)
    subset_labels = set(subset_labels)
    sparse_fetch = not options["skip-sparse-fetch"]

    # Boxes are determined by the primary volume/labels/roi
    boxes = self.init_boxes(primary_service, sparse_fetch and subset_labels, roi)
    batches = iter_batches(boxes, options["batch-size"])
    logger.info(f"Relabeling {len(boxes)} bricks in total.")
    logger.info(f"Processing {len(batches)} batches of {options['batch-size']} bricks each.")

    def _contingent_relabel(box):
        primary_vol = primary_service.get_subvolume(box)
        primary_vol = np.ascontiguousarray(primary_vol)

        contingency_vol = contingency_service.get_subvolume(box)
        contingency_vol = np.ascontiguousarray(contingency_vol)

        # Get the set of labels in this box, so we can discard irrelevant portions of the mapping.
        _primary_labels = pd.unique(primary_vol.reshape(-1))  # noqa
        _contingency_labels = pd.unique(contingency_vol.reshape(-1))  # noqa

        cm_path = options["contingent-mapping"]
        if cm_path.endswith('.npy'):
            _cm = np.load(cm_path)
        elif cm_path.endswith('.pkl'):
            with open(cm_path, 'rb') as f:
                _cm = pickle.load(f)
        else:
            raise RuntimeError(f"Don't know how to open mapping file: {cm_path}")

        cm_df = pd.DataFrame(_cm)
        assert {*cm_df.columns} == {'primary', 'contingency', 'final'}

        # Keep only the parts of the mapping we need for this box,
        # just for the sake of performance in the merge below.
        cm_df = cm_df.query('primary in @_primary_labels and contingency in @_contingency_labels').copy()
        cm_df['primary'] = cm_df['primary'].astype(primary_vol.dtype)
        cm_df['contingency'] = cm_df['contingency'].astype(contingency_vol.dtype)

        # Use a merge to essentially map from (primary, contingency) -> final
        input_df = pd.DataFrame({'primary': primary_vol.reshape(-1),
                                 'contingency': contingency_vol.reshape(-1)})
        input_df = input_df.merge(cm_df, 'left', on=['primary', 'contingency'])
        input_df['final'] = input_df['final'].fillna(input_df['primary'])
        input_df['final'] = input_df['final'].astype(primary_vol.dtype)

        final_vol = input_df['final'].values.reshape(primary_vol.shape)
        del input_df
        output_service.write_subvolume(final_vol, box[0])

    for batch_index, batch_boxes in enumerate(batches):
        with Timer(f"Batch {batch_index}: Relabeling", logger):
            # Aim for 4 partitions per worker
            total_cores = sum(self.client.ncores().values())
            (db.from_sequence(batch_boxes, npartitions=4 * total_cores)
               .map(_contingent_relabel)
               .compute())
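
# Tiny self-contained illustration of the merge-as-mapping trick used in
# _contingent_relabel() above (toy labels, not real data): pairs found in the
# mapping take their 'final' value, and unmatched (primary, contingency)
# pairs fall back to the primary label via fillna().
import numpy as np
import pandas as pd

cm_df = pd.DataFrame({'primary':     [1, 1, 2],
                      'contingency': [7, 8, 7],
                      'final':       [10, 11, 12]})

input_df = pd.DataFrame({'primary':     [1, 1, 2, 3],
                         'contingency': [7, 8, 9, 7]})

mapped = input_df.merge(cm_df, 'left', on=['primary', 'contingency'])
mapped['final'] = mapped['final'].fillna(mapped['primary']).astype(np.int64)
print(mapped['final'].tolist())  # [10, 11, 2, 3]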
def execute(self):
    self.init_services()
    left_service = self.left_service
    right_service = self.right_service
    options = self.config["contingencytable"]

    left_is_supervoxels = False
    if isinstance(left_service.base_service, DvidVolumeService):
        left_is_supervoxels = left_service.base_service.supervoxels

    left_roi = options["left-roi"]
    left_subset_labels = load_body_list(options["left-subset-labels"], left_is_supervoxels)
    sparse_fetch = not options["skip-sparse-fetch"]
    min_overlap = options["min-overlap-size"]

    # Boxes are determined by the left volume/labels/roi
    boxes = self.init_boxes(left_service, sparse_fetch and set(left_subset_labels), left_roi)

    def _contingency_table(box):
        left_vol = left_service.get_subvolume(box)
        right_vol = right_service.get_subvolume(box)

        table = contingency_table(left_vol, right_vol)
        table = table.sort_index().reset_index()

        # Compute sizes before filtering
        left_sizes = table.groupby('left')['voxel_count'].sum()
        right_sizes = table.groupby('right')['voxel_count'].sum()

        if len(left_subset_labels) > 0:
            # We keep rows if they match either of these criteria:
            #   1. they touch a left-subset label
            #   2. they touch a left label that intersects with one
            #      of the right labels from criteria 1.
            keep_left = left_sizes.index.intersection(left_subset_labels)  # noqa
            keep_right = table.query('left in @keep_left')['right'].unique()  # noqa
            table = table.query('left in @keep_left or right in @keep_right')

        if min_overlap > 1:
            table = table.query('voxel_count >= @min_overlap')

        left_sizes = left_sizes.loc[table['left'].unique()].reset_index()
        right_sizes = right_sizes.loc[table['right'].unique()].reset_index()
        return table, left_sizes, right_sizes

    batch_tables = []
    batch_left_sizes = []
    batch_right_sizes = []
    batches = iter_batches(boxes, options["batch-size"])
    logger.info(f"Computing contingency tables for {len(boxes)} bricks in total.")
    logger.info(f"Processing {len(batches)} batches of {options['batch-size']} bricks each.")

    for batch_index, batch_boxes in enumerate(batches):
        with Timer(f"Batch {batch_index}: Computing tables", logger):
            # Aim for 4 partitions per worker
            total_cores = sum(self.client.ncores().values())
            results = (db.from_sequence(batch_boxes, npartitions=4 * total_cores)
                         .map(_contingency_table)
                         .compute())

        tables, left_sizes, right_sizes = zip(*results)

        table = pd.concat(tables, ignore_index=True).sort_values(['left', 'right']).reset_index(drop=True)
        table = table.groupby(['left', 'right'], as_index=False, sort=False)['voxel_count'].sum()

        left_sizes = pd.concat(left_sizes, ignore_index=True).groupby('left')['voxel_count'].sum().reset_index()
        right_sizes = pd.concat(right_sizes, ignore_index=True).groupby('right')['voxel_count'].sum().reset_index()

        batch_tables.append(table)
        batch_left_sizes.append(left_sizes)
        batch_right_sizes.append(right_sizes)

    with Timer("Constructing final tables", logger):
        final_table = pd.concat(batch_tables, ignore_index=True).sort_values(['left', 'right']).reset_index(drop=True)
        final_table = final_table.groupby(['left', 'right'], as_index=False, sort=False)['voxel_count'].sum()

        final_left_sizes = pd.concat(batch_left_sizes, ignore_index=True).groupby('left')['voxel_count'].sum()
        final_right_sizes = pd.concat(batch_right_sizes, ignore_index=True).groupby('right')['voxel_count'].sum()

    def dump_table(table, p):
        with Timer(f"Writing {p}", logger), open(p, 'wb') as f:
            pickle.dump(table, f)

    dump_table(final_table, 'contingency_table.pkl')
    dump_table(final_left_sizes, 'left_sizes.pkl')
    dump_table(final_right_sizes, 'right_sizes.pkl')
def execute(self):
    self._init_service()
    options = self.config["roistats"]

    if not options["roi-server"]:
        assert isinstance(self.input_service, DvidVolumeService)
        options["roi-server"] = self.input_service.base_service.server

    if not options["roi-uuid"]:
        assert isinstance(self.input_service, DvidVolumeService)
        options["roi-uuid"] = self.input_service.base_service.uuid

    options["roi-uuid"] = resolve_ref(options["roi-server"], options["roi-uuid"])

    is_supervoxels = (isinstance(self.input_service, DvidVolumeService)
                      and self.input_service.base_service.supervoxels)  # noqa
    bodies = load_body_list(options["subset-bodies"], is_supervoxels)
    assert len(bodies) > 0, "Please provide a list of subset-bodies to process"

    scale = options["analysis-scale"]
    bounding_box = self.input_service.bounding_box_zyx
    assert not (bounding_box % 2**5).any(), \
        "Make sure your configured bounding box is divisible by 32px at scale 0."

    brick_shape = self.input_service.preferred_message_shape
    assert not (brick_shape % 2**5).any(), \
        "Make sure your preferred message shape divides into 32px blocks at scale 0"

    with Timer("Fetching ROI volume", logger):
        roi_vol_s5, roi_box_s5, overlaps = fetch_combined_roi_volume(
            options["roi-server"], options["roi-uuid"], options["rois"], False, bounding_box // 2**5)

    if len(overlaps) > 0:
        logger.warning(f"Some of your ROIs overlap! Here's an incomplete list:\n{overlaps}")

    with Timer("Determining brick set", logger):
        # Determine which bricks intersect our ROIs
        roi_brick_shape = self.input_service.preferred_message_shape // 2**5
        roi_brick_boxes = boxes_from_mask((roi_vol_s5 != 0), roi_box_s5[0], roi_brick_shape, clipped=False)
        roi_brick_boxes *= 2**5
        roi_brick_boxes = box_intersection(roi_brick_boxes, self.input_service.bounding_box_zyx)

        # Non-intersecting boxes have negative shape -- drop them.
        roi_brick_boxes = roi_brick_boxes[((roi_brick_boxes[:, 1, :] - roi_brick_boxes[:, 0, :]) > 0).all(axis=1)]
        roi_brick_coords_df = pd.DataFrame(roi_brick_boxes[:, 0, :], columns=[*'zyx'])

        try:
            body_brick_coords_df = self.input_service.sparse_brick_coords_for_labels(bodies)
        except NotImplementedError:
            # Use all bricks in the ROIs, and use the special label -1 to
            # indicate that all bodies in the list might be found there.
            # (See below.)
            brick_coords_df = roi_brick_coords_df
            brick_coords_df['label'] = -1
        else:
            brick_coords_df = body_brick_coords_df.merge(roi_brick_coords_df, 'inner', on=[*'zyx'])

        assert brick_coords_df.columns.tolist() == [*'zyx', 'label']
        np.save('brick-coords.npy', brick_coords_df.to_records(index=False))

    with Timer("Preparing bricks", logger):
        boxes_and_roi_bricks = []
        for coord, brick_labels in brick_coords_df.groupby([*'zyx'])['label'].agg(tuple).items():
            if brick_labels == (-1,):
                # No sparse body brick locations were found above.
                # Search for all bodies in all bricks.
                brick_labels = bodies

            box = np.array((coord, coord))
            box[1] += brick_shape
            box = box_intersection(box, bounding_box)

            roi_brick_box = (box // 2**5) - roi_box_s5[0]
            roi_brick_s5 = extract_subvol(roi_vol_s5, roi_brick_box)
            boxes_and_roi_bricks.append((box, roi_brick_s5, brick_labels))

    scaled_shape = brick_shape // (2**scale)
    logger.info(f"Prepared {len(boxes_and_roi_bricks)} bricks of scale-0 shape "
                f"{(*brick_shape[::-1],)} ({(*scaled_shape[::-1],)} at scale-{scale})")

    all_stats = []
    batches = [*iter_batches(boxes_and_roi_bricks, options["batch-size"])]
    logger.info(f"Processing {len(batches)} batches")
    for i, batch_boxes_and_bricks in enumerate(batches):
        with Timer(f"Batch {i:02d}", logger):
            batch_stats = self._execute_batch(scale, batch_boxes_and_bricks)
            all_stats.append(batch_stats)

    all_stats = pd.concat(all_stats, ignore_index=True)
    all_stats = all_stats.groupby(['body', 'roi_id'], as_index=False)['voxels'].sum()

    roi_names = pd.Series(["<none>", *options["rois"]], name='roi')
    roi_names.index.name = 'roi_id'
    all_stats = all_stats.merge(roi_names, 'left', on='roi_id')
    all_stats = all_stats.sort_values(['body', 'roi_id'])

    if scale > 0:
        all_stats.rename(columns={'voxels': f'voxels_s{scale}'}, inplace=True)

    with Timer(f"Writing stats ({len(all_stats)} rows)", logger):
        np.save('roi-stats.npy', all_stats.to_records(index=False))
        all_stats.to_csv('roi-stats.csv', index=False, header=True)