def _run_inference(blocking, block_list, halo, ds_in, ds_out, mask, preprocess, predict, channel_mapping, n_threads): block_shape = blocking.blockShape dtypes = [dso.dtype for dso in ds_out] dtype = dtypes[0] assert all(dtp == dtype for dtp in dtypes) @dask.delayed def log1(block_id): fu.log("start processing block %i" % block_id) return block_id @dask.delayed def load_input(block_id): block = blocking.getBlock(block_id) # if we have a mask, check if this block is in mask if mask is not None: bb = vu.block_to_bb(block) bb_mask = mask[bb] if np.sum(bb_mask) == 0: return block_id, None return block_id, _load_input(ds_in, block.begin, block_shape, halo) @dask.delayed def preprocess_impl(inputs): block_id, data = inputs if data is None: return block_id, None data = preprocess(data) return block_id, data @dask.delayed def predict_impl(inputs): block_id, data = inputs if data is None: return block_id, None data = predict(data) return block_id, data @dask.delayed def write_output(inputs): block_id, output = inputs if output is None: return block_id out_shape = output.shape if len(out_shape) == 3: assert len(ds_out) == 1 bb = vu.block_to_bb(blocking.getBlock(block_id)) # check if we need to crop the output actual_shape = [b.stop - b.start for b in bb] if actual_shape != block_shape: block_bb = tuple(slice(0, ash) for ash in actual_shape) if output.ndim == 4: block_bb = (slice(None), ) + block_bb output = output[block_bb] # cast to uint8 if necessary if dtype == 'uint8': output = _to_uint8(output) # write the output to our output dataset(s) for dso, chann_mapping in zip(ds_out, channel_mapping): chan_start, chan_stop = chann_mapping if dso.ndim == 3: assert chan_stop - chan_start == 1 out_bb = bb else: assert output.ndim == 4 assert chan_stop - chan_start == dso.shape[0] out_bb = (slice(None), ) + bb if output.ndim == 4: outp = output[chan_start:chan_stop].squeeze() dso[out_bb] = outp return block_id @dask.delayed def log2(block_id): fu.log_block_success(block_id) return 1 # iterate over the blocks in block list, get the input data and predict results = [] for block_id in block_list: res = tz.pipe(block_id, log1, load_input, preprocess_impl, predict_impl, write_output, log2) results.append(res) success = dask.compute(*results, scheduler='threads', num_workers=n_threads) fu.log('Finished prediction for %i blocks' % sum(success))
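# The function above chains per-block stages (log -> load -> preprocess -> predict -> write)
# as dask.delayed tasks via toolz.pipe. A minimal, self-contained sketch of that pattern with
# toy stages (the stage names below are illustrative, not part of this module):
import dask
import toolz as tz

@dask.delayed
def _toy_load(x):
    return x + 1

@dask.delayed
def _toy_process(x):
    return x * 2

_toy_results = [tz.pipe(i, _toy_load, _toy_process) for i in range(4)]
print(dask.compute(*_toy_results, scheduler='threads', num_workers=2))  # (2, 4, 6, 8)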
def _ds_block(blocking, block_id, ds_in, ds_out, scale_factor, halo, sampler): fu.log("start processing block %i" % block_id) # load the block (output dataset / downsampled) coordinates if halo is None: block = blocking.getBlock(block_id) local_bb = np.s_[:] in_bb = vu.block_to_bb(block) out_bb = vu.block_to_bb(block) out_shape = block.shape else: halo_ds = [ha // scale_factor for ha in halo] if isinstance(scale_factor, int) else\ [ha // sf for sf, ha in zip(scale_factor, halo)] block = blocking.getBlockWithHalo(block_id, halo_ds) in_bb = vu.block_to_bb(block.outerBlock) out_bb = vu.block_to_bb(block.innerBlock) local_bb = vu.block_to_bb(block.innerBlockLocal) out_shape = block.outerBlock.shape # check if we have channels ndim = ds_in.ndim in_shape = ds_in.shape if ndim == 4: in_shape = in_shape[1:] # upsample the input bounding box if isinstance(scale_factor, int): in_bb = tuple(slice(ib.start * scale_factor, min(ib.stop * scale_factor, sh)) for ib, sh in zip(in_bb, in_shape)) else: in_bb = tuple(slice(ib.start * sf, min(ib.stop * sf, sh)) for ib, sf, sh in zip(in_bb, scale_factor, in_shape)) # load the input if ndim == 4: in_bb = (slice(None),) + in_bb out_bb = (slice(None),) + out_bb local_bb = (slice(None),) + local_bb x = ds_in[in_bb] # don't sample empty blocks if np.sum(x != 0) == 0: fu.log_block_success(block_id) return dtype = x.dtype if np.dtype(dtype) != np.dtype('float32'): x = x.astype('float32') if ndim == 4: n_channels = x.shape[0] out = np.zeros((n_channels,) + tuple(out_shape), dtype=dtype) for c in range(n_channels): out[c] = _ds_vol(x[c], out_shape, sampler, scale_factor, dtype) else: out = _ds_vol(x, out_shape, sampler, scale_factor, dtype) try: ds_out[out_bb] = out[local_bb] except IndexError: raise(IndexError("%s, %s, %s" % (str(out_bb), str(local_bb), str(out.shape)))) # log block success fu.log_block_success(block_id)
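# The bounding-box upsampling done above, shown on toy values: a block given in downsampled
# coordinates is scaled back to full-resolution coordinates and clipped to the input shape
# (all numbers below are assumed for illustration):
scale_factor = (1, 2, 2)
in_shape = (25, 101, 101)
in_bb = (slice(0, 25), slice(25, 51), slice(25, 51))  # downsampled coordinates
full_bb = tuple(slice(ib.start * sf, min(ib.stop * sf, sh))
                for ib, sf, sh in zip(in_bb, scale_factor, in_shape))
print(full_bb)  # (slice(0, 25), slice(50, 101), slice(50, 101))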
def merge_uniques(job_id, config_path): fu.log("start processing job %i" % job_id) fu.log("reading config from %s" % config_path) with open(config_path, 'r') as f: config = json.load(f) n_jobs = config['n_jobs'] tmp_folder = config['tmp_folder'] n_threads = config['threads_per_job'] output_path = config['output_path'] output_key = config['output_key'] def _read_input(job_id): return np.load( os.path.join(tmp_folder, 'find_uniques_job_%i.npy' % job_id)) fu.log("read uniques") with futures.ThreadPoolExecutor(n_threads) as tp: tasks = [tp.submit(_read_input, job_id) for job_id in range(n_jobs)] uniques = np.concatenate([t.result() for t in tasks]) fu.log("compute uniques") uniques = np.unique(uniques) fu.log("found %i unique values" % len(uniques)) fu.log("saving results to %s/%s" % (output_path, output_key)) with vu.file_reader(output_path) as f: chunk_size = min(int(1e6), len(uniques)) chunks = (chunk_size, ) ds = f.create_dataset(output_key, shape=uniques.shape, dtype='uint64', compression='gzip', chunks=chunks) ds.n_threads = n_threads ds[:] = uniques # log success fu.log_job_success(job_id)
def _read_subresults(ds_results, block_node_prefix, blocking, block_list, n_threads, initial_node_labeling=None): def read_subres(block_id): block = blocking.getBlock(block_id) # load nodes corresponding to this block block_path = block_node_prefix + str(block_id) nodes = ndist.loadNodes(block_path) # load the sub result for this block chunk = tuple(beg // bs for beg, bs in zip(block.begin, blocking.blockShape)) subres = ds_results.read_chunk(chunk) # subres is None -> this block has ignore label # and has no edgees. Note that this does not imply that the # block ONLY has ignore label (or only one ordinary node) # because multiple ordinary nodes could be seperated by the ignore label # and thus not share an edge. if subres is None: assert 0 in nodes return None assert len(nodes) == len( subres), "block %i: %i, %i" % (block_id, len(nodes), len(subres)) return nodes, subres, int(subres.max()) + 1 with futures.ThreadPoolExecutor(n_threads) as tp: tasks = [tp.submit(read_subres, block_id) for block_id in block_list] results = [t.result() for t in tasks] # filter and get results block_list = [ block_id for block_id, res in zip(block_list, results) if res is not None ] block_nodes = [res[0] for res in results if res is not None] block_res = [res[1] for res in results if res is not None] block_offsets = np.array([res[2] for res in results if res is not None], dtype='uint64') # get the offsets and add them to the block results to make these unique block_offsets = np.roll(block_offsets, 1) block_offsets[0] = 0 block_offsets = np.cumsum(block_offsets) block_res = [bres + boff for bres, boff in zip(block_res, block_offsets)] # apply the node labeling if initial_node_labeling is not None: fu.log("Apply initial node labeling to block nodes") block_nodes = [initial_node_labeling[nodes] for nodes in block_nodes] # construct result dicts for each block # keep zero mapped to zero block_results = [{ node_id: res_id if node_id != 0 else 0 for node_id, res_id in zip(bnodes, bres) } for bnodes, bres in zip(block_nodes, block_res)] return block_list, block_results
def remove_noise_objects(job_id, config_path): fu.log("start processing job %i" % job_id) fu.log("reading config from %s" % config_path) with open(config_path, 'r') as f: config = json.load(f) input_path = config['input_path'] input_key = config['input_key'] graph_path = config['graph_path'] graph_key = config['graph_key'] output_path = config['output_path'] output_key = config['output_key'] output_graph_path = config['output_graph_path'] output_graph_key = config['output_graph_key'] with vu.file_reader(input_path) as f: input = f[input_key][:] with vu.file_reader(graph_path) as f: nodes = f[graph_key]['nodes'][:] edges = f[graph_key]['edges'][:] # First, remove all components except the largest one bg_candidates = [input[0,0,0], input[0,0,-1], input[0,-1,0], input[0,-1,-1], input[-1,0,0], input[-1,0,-1], input[-1,-1,0], input[-1,-1,-1]] bg_id = max(bg_candidates, key=bg_candidates.count) fu.log("Background id is %d" % bg_id) adj_m = np.zeros((len(nodes), len(nodes))) for edge in edges: if edge[0] != bg_id and edge[1] != bg_id: adj_m[edge[0], edge[1]] = 1 n_comp, comp = scipy.sparse.csgraph.connected_components(adj_m, directed=False, return_labels=True) fu.log("Found %d connected components" % n_comp) max_size = 0 max_comp = 0 for component in set(comp): if component != bg_id: compsize = 0 for label in nodes[comp==component]: compsize += (input == label).sum() if compsize > max_size: max_size = compsize max_comp = component to_remove = nodes[comp!=max_comp] for rc in to_remove: fu.log("Cleaning up id %d" % rc) if rc != bg_id: input[input==rc] = bg_id nodes[nodes==rc] = bg_id edges[edges==rc] = bg_id # Remove duplicates from graph nodes = np.unique(nodes) edge_new = [] for edge in edges: if edge[0] == edge[1]: pass elif edge[1] < edge[0]: edge_new.append([edge[1], edge[0]]) else: edge_new.append([edge[0], edge[1]]) edges = np.array(edge_new, dtype=edge.dtype) edges = np.unique(edge_new, axis=0) # Now do the size based merging, as a method to compensate for some leftover oversegmentation # Skip this step if we have too few objects (since that means we are early in the development # of the embryo, and there are these small polar bodies, which we dont want to merge with the cells) if len(nodes) >= 10: sizes = [] for node in nodes: sizes.append((input==node).sum()) size_median = np.median(sizes) min_cell_size = size_median * 0.3 sizes = np.array(sizes) node_sorted = np.argsort(sizes) for i in range(len(node_sorted)): node = node_sorted[i] if sizes[node] < min_cell_size: fu.log('Cell %d is too small '%node) neighbor_edges = edges[np.logical_or(edges[:,0]==node, edges[:,1]==node)] neighbors = neighbor_edges[neighbor_edges != node].reshape(-1) if len(neighbors)>0: smallest_neighbor_i = np.argmin(sizes[neighbors]) smallest_neighbor = neighbors[smallest_neighbor_i] input[input==node] = smallest_neighbor sizes[smallest_neighbor] += sizes[node] sizes[node] = 0 node_sorted = np.argsort(sizes) edges[edges==node] = smallest_neighbor fu.log('Merging %d with %d'% (node, smallest_neighbor)) with vu.file_reader(output_path,'w') as f: ds = f.require_dataset(output_key, dtype='uint32', shape=input.shape, compression='gzip') ds[:] = input with vu.file_reader(output_graph_path,'w') as f: ds = f.require_dataset(output_graph_key+"/nodes", dtype='uint32', shape=nodes.shape, compression='gzip') ds[:] = nodes ds = f.require_dataset(output_graph_key+"/edges", dtype='uint32', shape=edges.shape, compression='gzip') ds[:] = edges fu.log_job_success(job_id)
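# remove_noise_objects labels connected components on a dense adjacency matrix with scipy.
# A minimal sketch of that step on a toy graph (node count and edges are made up for
# illustration):
import numpy as np
import scipy.sparse.csgraph

n_nodes = 5
toy_edges = np.array([[1, 2], [3, 4]])
adj_m = np.zeros((n_nodes, n_nodes))
for u, v in toy_edges:
    adj_m[u, v] = 1
n_comp, comp = scipy.sparse.csgraph.connected_components(adj_m, directed=False,
                                                          return_labels=True)
print(n_comp, comp)  # 3 [0 1 1 2 2]  -> components {0}, {1, 2}, {3, 4}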
def solve_lifted_global(job_id, config_path):
    fu.log("start processing job %i" % job_id)
    fu.log("reading config from %s" % config_path)

    # get the config
    with open(config_path) as f:
        config = json.load(f)
    # path to the reduced problem
    problem_path = config['problem_path']
    # path where the node labeling shall be written
    assignment_path = config['assignment_path']
    assignment_key = config['assignment_key']
    lifted_prefix = config['lifted_prefix']
    scale = config['scale']
    agglomerator_key = config['agglomerator']
    n_threads = config['threads_per_job']
    time_limit = config.get('time_limit_solver', None)

    fu.log("using agglomerator %s" % agglomerator_key)
    solver = get_lifted_multicut_solver(agglomerator_key)

    with vu.file_reader(problem_path) as f:
        group = f['s%i' % scale]
        graph_group = group['graph'] if scale == 0 else group['graph_lmc']
        ignore_label = graph_group.attrs['ignore_label']

        ds = graph_group['edges']
        ds.n_threads = n_threads
        uv_ids = ds[:]
        n_edges = len(uv_ids)
        n_nodes = int(uv_ids.max()) + 1

        if scale > 0:
            ds = group['node_labeling_lmc']
            ds.n_threads = n_threads
            initial_node_labeling = ds[:]

        ds = group['costs'] if scale == 0 else group['costs_lmc']
        ds.n_threads = n_threads
        costs = ds[:]
        assert len(costs) == n_edges, "%i, %i" % (len(costs), n_edges)

        ds = group['lifted_nh_%s' % lifted_prefix]
        ds.n_threads = n_threads
        lifted_uvs = ds[:]

        ds = group['lifted_costs_%s' % lifted_prefix]
        ds.n_threads = n_threads
        lifted_costs = ds[:]

    graph = nifty.graph.undirectedGraph(n_nodes)
    graph.insertEdges(uv_ids)

    fu.log("start agglomeration")
    node_labeling = solver(graph, costs, lifted_uvs, lifted_costs,
                           n_threads=n_threads, time_limit=time_limit)
    fu.log("finished agglomeration")

    if scale > 0:
        # get the labeling of initial nodes
        initial_node_labeling = node_labeling[initial_node_labeling]
    else:
        initial_node_labeling = node_labeling
    n_nodes = len(initial_node_labeling)

    # make sure zero is mapped to 0 if we have an ignore label
    if ignore_label and initial_node_labeling[0] != 0:
        new_max_label = int(node_labeling.max() + 1)
        initial_node_labeling[initial_node_labeling == 0] = new_max_label
        initial_node_labeling[0] = 0

    # make node labeling consecutive
    vigra.analysis.relabelConsecutive(initial_node_labeling, start_label=1,
                                      keep_zeros=True, out=initial_node_labeling)

    # write assignments
    node_shape = (n_nodes,)
    chunks = (min(n_nodes, 524288),)
    with vu.file_reader(assignment_path) as f:
        ds = f.require_dataset(assignment_key, dtype='uint64', shape=node_shape,
                               chunks=chunks, compression='gzip')
        ds.n_threads = n_threads
        ds[:] = initial_node_labeling
    fu.log('saving results to %s:%s' % (assignment_path, assignment_key))
    fu.log_job_success(job_id)
def _apply_node_labels(costs, uv_ids, mode, labels, max_repulsive, max_attractive):
    # TODO for now we assume binary node labeling,
    # but of course we could also have something more fancy with
    # multiple label ids
    n_nodes = len(labels)
    max_node_id = int(uv_ids.max())
    assert max_node_id + 1 <= n_nodes, "%i, %i" % (max_node_id, n_nodes)
    with_label = np.arange(n_nodes, dtype='uint64')[labels > 0]
    fu.log("number of nodes with label %i / %i" % (len(with_label), n_nodes))

    if mode == 'ignore':
        fu.log("Node-label mode: ignore")
        # ignore mode: set all edges that connect to a node with label to max repulsive
        edges_with_label = np.isin(uv_ids, with_label)
        edges_with_label = edges_with_label.any(axis=1)
        costs[edges_with_label] = max_repulsive
    elif mode == 'isolate':
        fu.log("Node-label mode: isolate")
        # isolate mode: set all edges that connect a node with label to a node without
        # label to max repulsive, and all edges that connect two nodes with label
        # to max attractive
        edges_with_label = np.in1d(uv_ids, with_label).reshape(uv_ids.shape)
        label_sum = edges_with_label.sum(axis=1)
        att_edges = label_sum == 2
        rep_edges = label_sum == 1
        fu.log("number of attractive edges: %i / %i" % (att_edges.sum(), len(att_edges)))
        fu.log("number of repulsive edges: %i / %i" % (rep_edges.sum(), len(rep_edges)))
        costs[att_edges] = max_attractive
        costs[rep_edges] = max_repulsive
    elif mode == 'ignore_transition':
        fu.log("Node-label mode: ignore_transition")
        labels_mapped_to_edges = labels[uv_ids]
        transition = labels_mapped_to_edges[:, 0] != labels_mapped_to_edges[:, 1]
        costs[transition] = max_repulsive
        fu.log("number of repulsive edges: %i / %i" % (transition.sum(), len(transition)))
    else:
        raise RuntimeError("Invalid label mode: %s" % mode)
    return costs
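# A numpy-only illustration of the three node-label modes handled by _apply_node_labels,
# on a toy graph (values chosen for illustration only):
import numpy as np

uv_ids = np.array([[0, 1], [1, 2], [2, 3]], dtype='uint64')
labels = np.array([0, 1, 1, 0])                      # nodes 1 and 2 carry the label
with_label = np.arange(len(labels), dtype='uint64')[labels > 0]

edges_with_label = np.isin(uv_ids, with_label)
print(edges_with_label.any(axis=1))                  # 'ignore':  [ True  True  True]
label_sum = edges_with_label.sum(axis=1)
print(label_sum == 2, label_sum == 1)                # 'isolate': attractive [False  True False],
                                                     #            repulsive  [ True False  True]
print(labels[uv_ids][:, 0] != labels[uv_ids][:, 1])  # 'ignore_transition': [ True False  True]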
def background_size_filter(job_id, config_path): fu.log("start processing job %i" % job_id) fu.log("reading config from %s" % config_path) # read the config with open(config_path) as f: config = json.load(f) input_path = config['input_path'] input_key = config['input_key'] output_path = config['output_path'] output_key = config['output_key'] block_list = config['block_list'] block_shape = config['block_shape'] res_path = config['res_path'] # get the shape with vu.file_reader(input_path, 'r') as f: ds = f[input_key] shape = f[input_key].shape blocking = nt.blocking(roiBegin=[0, 0, 0], roiEnd=list(shape), blockShape=list(block_shape)) discard_ids = np.load(res_path) fu.log("Discarding %i ids" % len(discard_ids)) same_file = input_path == output_path in_place = same_file and input_key == output_key if in_place: with vu.file_reader(input_path) as f: ds = f[input_key] [ apply_block(block_id, blocking, ds, ds, discard_ids) for block_id in block_list ] elif same_file: with vu.file_reader(input_path) as f: ds_in = f[input_key] ds_out = f[output_key] [ apply_block(block_id, blocking, ds_in, ds_out, discard_ids) for block_id in block_list ] else: with vu.file_reader(input_path, 'r') as f_in, vu.file_reader(output_path) as f_out: ds_in = f_in[input_key] ds_out = f_out[output_key] [ apply_block(block_id, blocking, ds_in, ds_out, discard_ids) for block_id in block_list ] # copy the 'maxId' attribute if present if job_id == 0 and not in_place: with vu.file_reader(input_path, 'r') as f: attrs = f[input_key].attrs max_id = attrs.get('maxId', None) if max_id is not None: with vu.file_reader(output_path) as f: f[output_key].attrs['maxId'] = max_id fu.log_job_success(job_id)
def _mws_block_pass2(block_id, blocking, ds_in, ds_out, mask, offsets, strides, randomize_strides, halo, noise_level, max_block_id, tmp_folder): fu.log("(Pass2) start processing block %i" % block_id) block = blocking.getBlockWithHalo(block_id, halo) in_bb = vu.block_to_bb(block.outerBlock) if mask is None: # if we don't have a mask, initialize with fully 'in-mask' volume # bb_mask = np.ones(tuple(b.stop - b.begin for b in in_bb), # dtype='bool') bb_mask = None else: bb_mask = mask[in_bb].astype('bool') if np.sum(bb_mask) == 0: fu.log_block_success(block_id) return # TODO does this make sense ? # set the mask for parts of indirect neighbor blocks # (which are also part of pass 2) to 0 # bb_mask = mask_corners(bb_mask, halo) aff_bb = (slice(None), ) + in_bb affs = vu.normalize(ds_in[aff_bb]) # load seeds seeds = ds_out[in_bb] seed_ids = np.unique(seeds) if seed_ids[0] == 0: seed_ids = seed_ids[1:] # load the serialized state for the neighboring (pass1) blocks # and find relevant edges between seed ids seed_edges = [] seed_edge_weights = [] attractive_mask = [] for axis in range(3): for to_lower in (False, True): ngb_id = blocking.getNeighborId(block_id, axis, to_lower) # get path to state serialization and check if it exists save_path = os.path.join(tmp_folder, 'seg_state_block%i.h5' % ngb_id) if not os.path.exists(save_path): continue with vu.file_reader(save_path) as f: # first, load the edges and see if they have overlap with our seed ids ngb_edges = f['edges'][:] ngb_edge_mask = np.in1d(ngb_edges, seed_ids).reshape(ngb_edges.shape) ngb_edge_mask = ngb_edge_mask.all(axis=1) # if we have edges, load the corresponding weights # and attractive / repulsive state if ngb_edge_mask.sum() > 0: ngb_edges = ngb_edges[ngb_edge_mask] ngb_weights = f['weights'][:][ngb_edge_mask] ngb_attractive_edges = f['attractive_edge_mask'][:][ ngb_edge_mask] seed_edges.append(ngb_edges) seed_edge_weights.append(ngb_weights) attractive_mask.append(ngb_attractive_edges) seed_edges = np.concatenate(seed_edges, axis=0) seed_edge_weights = np.concatenate(seed_edge_weights) attractive_mask = np.concatenate(attractive_mask) assert len(seed_edges) == len(seed_edge_weights) == len(attractive_mask) repulsive_mask = np.logical_not(attractive_mask) attractive_edges, repulsive_edges = seed_edges[ attractive_mask], seed_edges[repulsive_mask] attractive_weights, repulsive_weights = seed_edge_weights[ attractive_mask], seed_edge_weights[repulsive_mask] # run mws segmentation with seeds seed_state = { 'attractive': (attractive_edges, attractive_weights), 'repulsive': (repulsive_edges, repulsive_weights) } seg, grid_graph = mutex_watershed_with_seeds( affs, offsets, seeds, strides=strides, mask=bb_mask, randomize_strides=randomize_strides, noise_level=noise_level, return_graph=True, seed_state=seed_state) # offset with lowest block coordinate offset_id = block_id * np.prod(blocking.blockShape) vigra.analysis.relabelConsecutive(seg, start_label=offset_id, keep_zeros=True, out=seg) # find assignment of seed ids to segmentation ids assignments = grid_graph.get_seed_assignments_from_node_labels( seg.flatten()) # get the cropped segmentation local_bb = vu.block_to_bb(block.innerBlockLocal) seg_crop = seg[local_bb] # filter the assignments from ids that are not in the crop crop_ids = np.unique(seg_crop) filter_mask = np.in1d(assignments[:, 1], crop_ids) assignments = assignments[filter_mask] # store assignments to tmp folder save_path = os.path.join( tmp_folder, 'mws_two_pass_assignments_block_%i.npy' % block_id) np.save(save_path, 
assignments) out_bb = vu.block_to_bb(block.innerBlock) ds_out[out_bb] = seg_crop # write max-id for the last block if block_id == max_block_id: _write_nlabels(ds_out, seg) # log block success fu.log_block_success(block_id)
def merge_lifted_problems(job_id, config_path): fu.log("start processing job %i" % job_id) fu.log("reading config from %s" % config_path) # get the config with open(config_path) as f: config = json.load(f) path = config['path'] prefixs = config['prefixs'] out_prefix = config['out_prefix'] n_threads = config.get('threads_per_job', 1) f = z5py.File(path) edge_root = 's0/lifted_nh_%s' cost_root = 's0/lifted_costs_%s' edges = [] costs = [] for prefix in prefixs: edge_key = edge_root % prefix cost_key = cost_root % prefix ds_edges = f[edge_key] ds_edges.n_threads = n_threads this_edges = ds_edges[:] ds_costs = f[cost_key] ds_costs.n_threads = n_threads this_costs = ds_costs[:] assert len(this_costs) == len(this_edges) edges.append(this_edges) costs.append(this_costs) # TODO would be cleaner to # - sort the edges again # - see if any of the edges are duplicate and add up costs if they are edges = np.concatenate(edges, axis=0) costs = np.concatenate(costs, axis=0) edge_out_key = edge_root % out_prefix edge_chunks = (min(len(edges), 100000), 2) ds_edges_out = f.require_dataset(edge_out_key, shape=edges.shape, compression='gzip', dtype=edges.dtype, chunks=edge_chunks) ds_edges_out.n_threads = n_threads ds_edges_out[:] = edges cost_out_key = cost_root % out_prefix cost_chunks = (min(len(costs), 100000), ) ds_costs_out = f.require_dataset(cost_out_key, shape=costs.shape, compression='gzip', dtype=costs.dtype, chunks=cost_chunks) ds_costs_out.n_threads = n_threads ds_costs_out[:] = costs fu.log_job_success(job_id)
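# The TODO above (detect duplicate lifted edges and add up their costs) could be addressed
# roughly as follows; a sketch only, assuming each edge is stored as a sorted (min, max)
# pair so that duplicates are exact row matches:
import numpy as np

toy_edges = np.array([[0, 1], [2, 3], [0, 1]], dtype='uint64')
toy_costs = np.array([0.5, -1.0, 0.25])
unique_edges, inv = np.unique(toy_edges, axis=0, return_inverse=True)
merged_costs = np.zeros(len(unique_edges), dtype=toy_costs.dtype)
np.add.at(merged_costs, inv.ravel(), toy_costs)  # sum up the costs of duplicate edges
print(unique_edges.tolist())   # [[0, 1], [2, 3]]
print(merged_costs)            # [ 0.75 -1.  ]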
def graph_watershed_assignments(job_id, config_path): fu.log("start processing job %i" % job_id) fu.log("reading config from %s" % config_path) # get the config with open(config_path) as f: config = json.load(f) # load from config assignment_path = config['assignment_path'] assignment_key = config['assignment_key'] problem_path = config['problem_path'] graph_key = config['graph_key'] features_key = config['features_key'] filter_nodes_path = config['filter_nodes_path'] output_path = config['output_path'] output_key = config['output_key'] relabel = config['relabel'] from_costs = config['from_costs'] n_threads = config.get('threads_per_job', 1) # load the uv-ids, features and assignments fu.log("Read features and edges from %s" % problem_path) with vu.file_reader(problem_path, 'r') as f: ds = f['%s/edges' % graph_key] ds.n_threads = n_threads uv_ids = ds[:] n_nodes = int(uv_ids.max()) + 1 ds = f[features_key] ds.n_threads = n_threads if ds.ndim == 2: features = ds[:, 0].squeeze() else: features = ds[:] if from_costs: minc = features.min() fu.log("Mapping costs with range %f to %f to range 0 to 1" % (minc, features.max())) features -= minc features /= features.max() features = 1. - features fu.log("Read assignments from %s" % assignment_path) with vu.file_reader(assignment_path, 'r') as f: ds = f[assignment_key] ds.n_threads = n_threads chunks = ds.chunks assignments = ds[:] assert n_nodes == len(assignments),\ "Expected number of nodes %i and number of assignments %i does not agree" % (n_nodes, len(assignments)) seed_offset = int(assignments.max()) + 1 # load the discard ids discard_ids = np.load(filter_nodes_path) assert 0 not in discard_ids, "Breaks logic" # build the new graph graph = nifty.graph.undirectedGraph(n_nodes) graph.insertEdges(uv_ids) # run graph watershed to get the new assignments # map zero label to new id assignments[assignments == 0] = seed_offset discard_mask = np.in1d(assignments, discard_ids) assignments[discard_mask] = 0 n_discard = int(discard_mask.sum()) fu.log("Discarding %i / %i fragments" % (n_discard, assignments.size)) fu.log("Start grah watershed") assignments = nifty.graph.edgeWeightedWatershedsSegmentation( graph, assignments, features) fu.log("Finished graph watershed") assignments[assignments == seed_offset] = 0 if relabel: max_id = vigra.analysis.relabelConsecutive(assignments, out=assignments, start_label=1, keep_zeros=True)[1] fu.log("Max-id after relabeling: %i (before was %i)" % (max_id, seed_offset - 1)) with vu.file_reader(output_path) as f: ds = f.require_dataset(output_key, shape=assignments.shape, chunks=chunks, compression='gzip', dtype='uint64') ds.n_threads = n_threads ds[:] = assignments fu.log_job_success(job_id)
def find_labeling(job_id, config_path): fu.log("start processing job %i" % job_id) fu.log("reading config from %s" % config_path) with open(config_path, 'r') as f: config = json.load(f) n_jobs = config['n_jobs'] tmp_folder = config['tmp_folder'] input_path = config['input_path'] input_key = config['input_key'] n_threads = config['threads_per_job'] assignment_path = config['assignment_path'] def _read_input(job_id): return np.load( os.path.join(tmp_folder, 'find_uniques_job_%i.npy' % job_id)) # TODO this could be parallelized fu.log("read uniques") with futures.ThreadPoolExecutor(n_threads) as tp: tasks = [tp.submit(_read_input, job_id) for job_id in range(n_jobs)] uniques = np.concatenate([t.result() for t in tasks]) fu.log("compute uniques") # uniques = nt.unique(uniques) uniques = np.unique(uniques) fu.log("relabel") _, max_id, mapping = vigra.analysis.relabelConsecutive(uniques, keep_zeros=True, start_label=1) fu.log("saving results to %s" % assignment_path) with open(assignment_path, 'wb') as f: pickle.dump(mapping, f) # log success fu.log_job_success(job_id)
def inference(job_id, config_path): fu.log("start processing job %i" % job_id) fu.log("reading config from %s" % config_path) # get the config with open(config_path) as f: config = json.load(f) input_path = config['input_path'] input_key = config['input_key'] output_path = config['output_path'] checkpoint_path = config['checkpoint_path'] block_shape = config['block_shape'] block_list = config['block_list'] halo = config['halo'] framework = config['framework'] n_threads = config['threads_per_job'] fu.log("run iference with framework %s, with %i threads" % (framework, n_threads)) output_keys = config['output_keys'] channel_mapping = config['channel_mapping'] if config.get('set_visible_device', False): os.environ['CUDA_VISIBLE_DEVICES'] = str(job_id) fu.log("setting cuda visible devices to %i" % job_id) gpu = 0 fu.log("Loading model from %s" % checkpoint_path) predict = get_predictor(framework)(checkpoint_path, halo, gpu=gpu) fu.log("Have model") preprocess = get_preprocessor(framework) shape = vu.get_shape(input_path, input_key) blocking = nt.blocking(roiBegin=[0, 0, 0], roiEnd=list(shape), blockShape=list(block_shape)) with vu.file_reader(input_path, 'r') as f_in, vu.file_reader(output_path) as f_out: ds_in = f_in[input_key] ds_out = [f_out[key] for key in output_keys] if 'mask_path' in config: mask = vu.load_mask(config['mask_path'], config['mask_key'], shape) else: mask = None _run_inference(blocking, block_list, halo, ds_in, ds_out, mask, preprocess, predict, channel_mapping, n_threads) fu.log_job_success(job_id)
def log1(block_id):
    fu.log("start processing block %i" % block_id)
    return block_id
def reduce_problem(job_id, config_path): fu.log("start processing job %i" % job_id) fu.log("reading config from %s" % config_path) # get the config with open(config_path) as f: config = json.load(f) problem_path = config['problem_path'] initial_block_shape = config['block_shape'] scale = config['scale'] block_list = config['block_list'] accumulation_method = config.get('accumulation_method', 'sum') n_threads = config['threads_per_job'] roi_begin = config.get('roi_begin', None) roi_end = config.get('roi_end', None) # get the number of nodes and uv-ids at this scale level # as well as the initial node labeling fu.log("read problem from %s" % problem_path) graph_key = 's%i/graph' % scale with vu.file_reader(problem_path, 'r') as f: # load graph nodes and edges group = f[graph_key] shape = group.attrs['shape'] # nodes # we only need to load the nodes for scale 0 # otherwise, we already know that they are consecutive if scale == 0: ds = group['nodes'] ds.n_threads = n_threads nodes = ds[:] n_nodes = len(nodes) else: n_nodes = group.attrs['numberOfNodes'] nodes = np.arange(n_nodes, dtype='uint64') # edges ds = group['edges'] ds.n_threads = n_threads uv_ids = ds[:] n_edges = len(uv_ids) # read initial node labeling if scale == 0: initial_node_labeling = None else: ds = f['s%i/node_labeling' % scale] ds.n_threads = n_threads initial_node_labeling = ds[:] costs_key = 's%i/costs' % scale with vu.file_reader(problem_path) as f: ds = f[costs_key] ds.n_threads = n_threads costs = ds[:] assert len(costs) == n_edges, "%i, %i" (len(costs), n_edges) block_shape = [bsh * 2**scale for bsh in initial_block_shape] blocking = nt.blocking([0, 0, 0], shape, block_shape) # get the new node assignment fu.log("merge nodes") n_new_nodes, node_labeling, new_initial_node_labeling = _merge_nodes( problem_path, scale, blocking, block_list, nodes, uv_ids, initial_node_labeling, n_threads) # get the new edge assignment fu.log("get new edge ids") new_uv_ids, edge_labeling, new_costs = _get_new_edges( uv_ids, node_labeling, costs, accumulation_method, n_threads) # serialize the input graph and costs for the next scale level fu.log("serialize new problem to %s/s%i" % (problem_path, scale + 1)) n_new_edges = _serialize_new_problem(problem_path, n_new_nodes, new_uv_ids, node_labeling, edge_labeling, new_costs, new_initial_node_labeling, shape, scale, initial_block_shape, n_threads, roi_begin, roi_end) fu.log("Reduced graph from %i to %i nodes; %i to %i edges." % (n_nodes, n_new_nodes, n_edges, n_new_edges)) fu.log_job_success(job_id)
def merge_offsets(job_id, config_path): fu.log("start processing job %i" % job_id) fu.log("reading config from %s" % config_path) with open(config_path, 'r') as f: config = json.load(f) tmp_folder = config['tmp_folder'] n_jobs = config['n_jobs'] save_path = config['save_path'] n_blocks = config['n_blocks'] save_prefix = config['save_prefix'] offsets = {} for block_job_id in range(n_jobs): path = os.path.join(tmp_folder, '%s_%i.json' % (save_prefix, block_job_id)) with open(path, 'r') as f: offsets.update(json.load(f)) os.remove(path) # NOTE: the block-id keys in 'offsets' are stored as str, so we can't just use # 'sorted(offsets.items())' because it would string-sort! blocks = list(map(int, list(offsets.keys()))) offset_list = list(offsets.values()) assert len(blocks) == len(offset_list) == n_blocks fu.log("merging offsets for %i blocks" % n_blocks) key_sort = np.argsort(blocks) offset_list = np.array([offset_list[k] for k in key_sort], dtype='uint64') last_offset = offset_list[-1] empty_blocks = np.where(offset_list == 0)[0].tolist() offset_list = np.roll(offset_list, 1) offset_list[0] = 0 offset_list = np.cumsum(offset_list).tolist() assert len(offset_list) == n_blocks, "%i, %i" % (len(offset_list), n_blocks) n_labels = offset_list[-1] + last_offset + 1 fu.log("number of empty blocks: %i / %i" % (len(empty_blocks), n_blocks)) fu.log("total number of labels: %i" % n_labels) fu.log("dumping offsets to %s" % save_path) with open(save_path, 'w') as f: json.dump({'offsets': offset_list, 'empty_blocks': empty_blocks, 'n_labels': n_labels}, f) fu.log_job_success(job_id)
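# The offset accumulation above turns per-block label counts into exclusive prefix sums
# (block i is offset by the summed counts of all preceding blocks). Toy illustration with
# assumed counts:
import numpy as np

counts = np.array([4, 0, 3, 2], dtype='uint64')  # number of labels found per block
last_count = counts[-1]
offsets = np.roll(counts, 1)
offsets[0] = 0
offsets = np.cumsum(offsets)
print(offsets.tolist())                   # [0, 4, 4, 7]
print(int(offsets[-1] + last_count) + 1)  # total label count as computed above: 10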
def agglomerative_clustering(job_id, config_path): fu.log("start processing job %i" % job_id) fu.log("reading config from %s" % config_path) # get the config with open(config_path) as f: config = json.load(f) # path to the reduced problem problem_path = config['problem_path'] # path where the node labeling shall be written assignment_path = config['assignment_path'] assignment_key = config['assignment_key'] features_path = config['features_path'] features_key = config['features_key'] threshold = config['threshold'] n_threads = config['threads_per_job'] scale = 0 with vu.file_reader(problem_path) as f: group = f['s%i' % scale] graph_group = group['graph'] ignore_label = graph_group.attrs['ignoreLabel'] ds = graph_group['edges'] ds.n_threads = n_threads uv_ids = ds[:] n_edges = len(uv_ids) with vu.file_reader(features_path) as f: ds = f[features_key] ds.n_threads = n_threads edge_features = ds[:, 0].squeeze() edge_sizes = ds[:, -1].squeeze() assert len(edge_features) == n_edges n_nodes = int(uv_ids.max()) + 1 fu.log("creating graph with %i nodes an %i edges" % (n_nodes, len(uv_ids))) graph = nifty.graph.undirectedGraph(n_nodes) graph.insertEdges(uv_ids) fu.log("start agglomeration") # TODO also support vanilla agglomerative clustering node_labeling = su.mala_clustering(graph, edge_features, edge_sizes, threshold) fu.log("finished agglomeration") n_nodes = len(node_labeling) # make sure zero is mapped to 0 if we have an ignore label if ignore_label and node_labeling[0] != 0: new_max_label = int(node_labeling.max() + 1) node_labeling[node_labeling == 0] = new_max_label node_labeling[0] = 0 node_shape = (n_nodes, ) chunks = (min(n_nodes, 524288), ) with vu.file_reader(assignment_path) as f: ds = f.require_dataset(assignment_key, dtype='uint64', shape=node_shape, chunks=chunks, compression='gzip') ds.n_threads = n_threads ds[:] = node_labeling fu.log('saving results to %s:%s' % (assignment_path, assignment_key)) fu.log_job_success(job_id)
def solve_subproblems(job_id, config_path): fu.log("start processing job %i" % job_id) fu.log("reading config from %s" % config_path) # get the config with open(config_path) as f: config = json.load(f) # input configs problem_path = config['problem_path'] scale = config['scale'] block_shape = config['block_shape'] block_list = config['block_list'] n_threads = config['threads_per_job'] agglomerator_key = config['agglomerator'] time_limit = config.get('time_limit_solver', None) fu.log("reading problem from %s" % problem_path) problem = z5py.N5File(problem_path) shape = problem.attrs['shape'] # load the costs costs_key = 's%i/costs' % scale fu.log("reading costs from path in problem: %s" % costs_key) ds = problem[costs_key] ds.n_threads = n_threads costs = ds[:] # load the graph graph_key = 's%i/graph' % scale fu.log("reading graph from path in problem: %s" % graph_key) graph = ndist.Graph(os.path.join(problem_path, graph_key), numberOfThreads=n_threads) uv_ids = graph.uvIds() # check if the problem has an ignore-label ignore_label = problem[graph_key].attrs['ignoreLabel'] fu.log("ignore label is %s" % ('true' if ignore_label else 'false')) fu.log("using agglomerator %s" % agglomerator_key) agglomerator = su.key_to_agglomerator(agglomerator_key) # the output group out = problem['s%i/sub_results' % scale] # TODO this should be a n5 varlen dataset as well and # then this is just another dataset in problem path block_prefix = os.path.join(problem_path, 's%i' % scale, 'sub_graphs', 'block_') blocking = nt.blocking([0, 0, 0], shape, list(block_shape)) with futures.ThreadPoolExecutor(n_threads) as tp: tasks = [ tp.submit(_solve_block_problem, block_id, graph, uv_ids, block_prefix, costs, agglomerator, ignore_label, blocking, out, time_limit) for block_id in block_list ] [t.result() for t in tasks] fu.log_job_success(job_id)
def _accumulate_block(block_id, blocking, ds_in, ds_labels, out_prefix, graph_block_prefix, blocks_prefix, filters, sigmas, halo, ignore_label, apply_in_2d, channel_agglomeration): fu.log("start processing block %i" % block_id) # load graph and check if this block has edges graph = ndist.Graph(graph_block_prefix + str(block_id)) if graph.numberOfEdges == 0: fu.log("block %i has no edges" % block_id) fu.log_block_success(block_id) return shape = ds_labels.shape # get the bounding if sum(halo) > 0: block = blocking.getBlockWithHalo(block_id, halo) block_shape = block.outerBlock.shape bb_in = vu.block_to_bb(block.outerBlock) bb = vu.block_to_bb(block.innerBlock) bb_local = vu.block_to_bb(block.innerBlockLocal) # increase inner bounding box by 1 in posirive direction # in accordance with the graph extraction bb = tuple( slice(b.start, min(b.stop + 1, sh)) for b, sh in zip(bb, shape)) bb_local = tuple( slice(b.start, min(b.stop + 1, bsh)) for b, bsh in zip(bb_local, block_shape)) else: block = blocking.getBlock(block_id) bb = vu.block_to_bb(block) bb = tuple( slice(b.start, min(b.stop + 1, sh)) for b, sh in zip(bb, shape)) bb_in = bb bb_local = slice(None) input_dim = ds_in.ndim # TODO make choice of channels optional if input_dim == 4: bb_in = (slice(0, 3), ) + bb_in input_ = vu.normalize(ds_in[bb_in]) if input_dim == 4: assert channel_agglomeration is not None input_ = getattr(np, channel_agglomeration)(input_, axis=0) # load labels labels = ds_labels[bb] # TODO pre-smoothing ?! # accumulate the edge features edge_features = [ _accumulate_filter(input_, graph, labels, bb_local, filter_name, sigma, ignore_label, filter_name == filters[-1] and sigma == sigmas[-1], apply_in_2d) for filter_name in filters for sigma in sigmas ] edge_features = np.concatenate(edge_features, axis=1) # save the features save_path = out_prefix + str(block_id) fu.log("saving feature result of shape %s to %s" % (str(edge_features.shape), save_path)) save_root, save_key = os.path.split(save_path) with z5py.N5File(save_root) as f: f.create_dataset(save_key, data=edge_features, chunks=edge_features.shape) fu.log_block_success(block_id)
def merge_assignments(job_id, config_path):
    fu.log("start processing job %i" % job_id)
    fu.log("reading config from %s" % config_path)
    with open(config_path, 'r') as f:
        config = json.load(f)
    output_path = config['output_path']
    output_key = config['output_key']
    tmp_folder = config['tmp_folder']
    n_jobs = config['n_jobs']
    offset_path = config['offset_path']
    save_prefix = config['save_prefix']

    with open(offset_path) as f:
        n_labels = int(json.load(f)['n_labels'])
    labels = np.arange(n_labels, dtype='uint64')

    # load and remove assignments
    assignments = [np.load(os.path.join(tmp_folder, '%s_%i.npy' % (save_prefix, block_job_id)))
                   for block_job_id in range(n_jobs)]
    # for block_job_id in range(n_jobs):
    #     os.remove(os.path.join(tmp_folder, 'assignments_%i.npy' % block_job_id))

    if all(ass.size for ass in assignments):
        assignments = np.concatenate(assignments, axis=0)
        assignments = np.unique(assignments, axis=0)
        assert assignments.shape[1] == 2
        fu.log("have %i pairs of node assignments" % len(assignments))
        have_assignments = True
    else:
        fu.log("did not find any node assignments, label assignment will be identity")
        have_assignments = False

    ufd = nufd.boost_ufd(labels)
    if have_assignments:
        assert int(assignments.max()) + 1 <= n_labels, "%i, %i" % (int(assignments.max()) + 1,
                                                                   n_labels)
        ufd.merge(assignments)

    label_assignments = ufd.find(labels)
    label_assignments, max_id, _ = vigra.analysis.relabelConsecutive(label_assignments,
                                                                     keep_zeros=True,
                                                                     start_label=1)
    assert len(label_assignments) == n_labels
    fu.log("reducing the number of labels from %i to %i" % (n_labels, max_id + 1))

    chunks = (min(65334, n_labels),)
    with vu.file_reader(output_path) as f:
        f.create_dataset(output_key, data=label_assignments, compression='gzip', chunks=chunks)
    fu.log_job_success(job_id)
def probs_to_costs(job_id, config_path): fu.log("start processing job %i" % job_id) fu.log("reading config from %s" % config_path) with open(config_path, 'r') as f: config = json.load(f) input_path = config['input_path'] input_key = config['input_key'] output_path = config['output_path'] output_key = config['output_key'] features_path = config['features_path'] features_key = config['features_key'] # config for cost transformations invert_inputs = config.get('invert_inputs', False) transform_to_costs = config.get('transform_to_costs', True) weight_edges = config.get('weight_edges', False) weighting_exponent = config.get('weighting_exponent', 1.) beta = config.get('beta', 0.5) # additional node labels node_labels = config.get('node_labels', None) n_threads = config['threads_per_job'] fu.log("reading input from %s:%s" % (input_path, input_key)) with vu.file_reader(input_path) as f: ds = f[input_key] ds.n_threads = n_threads # we might have 1d or 2d inputs, depending on input from features or random forest slice_ = slice(None) if ds.ndim == 1 else (slice(None), slice(0, 1)) costs = ds[slice_].squeeze() # normalize to range 0, 1 min_, max_ = costs.min(), costs.max() fu.log('input-range: %f %f' % (min_, max_)) fu.log('%f +- %f' % (costs.mean(), costs.std())) if invert_inputs: fu.log("inverting probability inputs") costs = 1. - costs if transform_to_costs: fu.log("converting probability inputs to costs") if weight_edges: fu.log("weighting edges by size") # the edge sizes are at the last feature index with vu.file_reader(features_path) as f: ds = f[features_key] n_features = ds.shape[1] ds.n_threads = n_threads edge_sizes = ds[:, n_features - 1:n_features].squeeze() else: fu.log("no edge weighting") edge_sizes = None costs = _transform_probabilities_to_costs( costs, beta=beta, edge_sizes=edge_sizes, weighting_exponent=weighting_exponent) # adjust edges of nodes with labels if given if node_labels is not None: fu.log("have node labels") max_repulsive = 5 * costs.min() max_attractive = 5 * costs.max() fu.log("maximally attractive edge weight %f" % max_attractive) fu.log("maximally repulsive edge weight %f" % max_repulsive) with vu.file_reader(features_path, 'r') as f: ds = f['s0/graph/edges'] ds.n_threads = n_threads uv_ids = ds[:] for mode, path_key in node_labels.items(): path, key = path_key fu.log("applying node labels with mode %s from %s:%s" % (mode, path, key)) with vu.file_reader(path, 'r') as f: ds = f[key] ds.n_threads = n_threads labels = ds[:] costs = _apply_node_labels(costs, uv_ids, mode, labels, max_repulsive, max_attractive) with vu.file_reader(output_path) as f: ds = f[output_key] ds.n_threads = n_threads ds[:] = costs fu.log_job_success(job_id)
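# _transform_probabilities_to_costs is defined elsewhere in this package. A sketch of the
# standard multicut weighting it presumably implements (assumed formula, not copied from
# this repo): merge probabilities p are mapped to signed costs via log((1 - p) / p) plus a
# boundary bias log((1 - beta) / beta), optionally scaled by normalized edge sizes raised
# to the weighting exponent.
import numpy as np

def _probs_to_costs_sketch(probs, beta=0.5, edge_sizes=None, weighting_exponent=1.):
    p = np.clip(probs, 0.001, 0.999)  # avoid log(0)
    costs = np.log((1. - p) / p) + np.log((1. - beta) / beta)
    if edge_sizes is not None:
        costs *= (edge_sizes / edge_sizes.max()) ** weighting_exponent
    return costs

print(_probs_to_costs_sketch(np.array([0.1, 0.5, 0.9])))  # ~[ 2.2  0. -2.2]: attractive / neutral / repulsive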
def write(job_id, config_path): fu.log("start processing job %i" % job_id) fu.log("loading config from %s" % config_path) with open(config_path, 'r') as f: config = json.load(f) # read I/O config input_path = config['input_path'] input_key = config['input_key'] # check if we write in-place if 'output_path' in config: output_path = config['output_path'] output_key = config['output_key'] in_place = False else: in_place = True block_shape = config['block_shape'] block_list = config['block_list'] n_threads = config.get('threads_per_core', 1) allow_empty_assignments = config.get('allow_empty_assignments', False) # read node assignments assignment_path = config['assignment_path'] assignment_key = config.get('assignment_key', None) fu.log("loading node labels from %s" % assignment_path) node_labels = _load_assignments(assignment_path, assignment_key, n_threads) offset_path = config.get('offset_path', None) # if we write in-place, we only need to open one file and one dataset if in_place: with vu.file_reader(input_path) as f: ds_in = f[input_key] ds_out = ds_in shape = ds_in.shape blocking = nt.blocking([0, 0, 0], list(shape), list(block_shape)) if offset_path is None: _write(ds_in, ds_out, blocking, block_list, n_threads, node_labels, allow_empty_assignments) else: _write_with_offsets(ds_in, ds_out, blocking, block_list, n_threads, node_labels, offset_path, allow_empty_assignments) # write the max-label # for job 0 if job_id == 0: _write_maxlabel(input_path, input_key, node_labels) else: # even if we do not write in-place, we might still write to the same output_file, # but different datasets # hdf5 does not like opening a file twice, so we need to check for this if input_path == output_path: with vu.file_reader(input_path) as f: ds_in = f[input_key] ds_out = f[output_key] shape = ds_in.shape blocking = nt.blocking([0, 0, 0], list(shape), list(block_shape)) if offset_path is None: _write(ds_in, ds_out, blocking, block_list, n_threads, node_labels, allow_empty_assignments) else: _write_with_offsets(ds_in, ds_out, blocking, block_list, n_threads, node_labels, offset_path, allow_empty_assignments) else: with vu.file_reader( input_path, 'r') as f_in, vu.file_reader(output_path) as f_out: ds_in = f_in[input_key] ds_out = f_out[output_key] shape = ds_in.shape blocking = nt.blocking([0, 0, 0], list(shape), list(block_shape)) if offset_path is None: _write(ds_in, ds_out, blocking, block_list, n_threads, node_labels, allow_empty_assignments) else: _write_with_offsets(ds_in, ds_out, blocking, block_list, n_threads, node_labels, offset_path, allow_empty_assignments) # write the max-label # for job 0 if job_id == 0: _write_maxlabel(output_path, output_key, node_labels) fu.log_job_success(job_id)
def sub_solutions(job_id, config_path): fu.log("start processing job %i" % job_id) fu.log("reading config from %s" % config_path) # get the config with open(config_path) as f: config = json.load(f) # input configs problem_path = config['problem_path'] scale = config['scale'] block_shape = config['block_shape'] block_list = config['block_list'] n_threads = config['threads_per_job'] output_path = config['output_path'] output_key = config['output_key'] ws_path = config['ws_path'] ws_key = config['ws_key'] sub_result_identifier = config.get('sub_result_identifier', 'sub_results') sub_graph_identifier = config.get('sub_graph_identifier', 'sub_graphs') fu.log("reading problem from %s" % problem_path) problem = z5py.N5File(problem_path) shape = problem.attrs['shape'] blocking = nt.blocking([0, 0, 0], list(shape), list(block_shape)) # we need to project the ws labels back to the original labeling # for this, we first need to load the initial node labeling if scale > 1: node_label_key = 's%i/node_labeling' % scale fu.log("scale %i > 1; reading node labeling from %s" % (scale, node_label_key)) ds_node_labeling = problem[node_label_key] ds_node_labeling.n_threads = n_threads initial_node_labeling = ds_node_labeling[:] else: initial_node_labeling = None # read the sub results ds_results = problem['s%i/%s/node_result' % (scale, sub_result_identifier)] # TODO should be varlen dataset fu.log("reading subresults") block_node_prefix = os.path.join(problem_path, 's%i' % scale, sub_graph_identifier, 'block_') block_list, block_results = _read_subresults(ds_results, block_node_prefix, blocking, block_list, n_threads, initial_node_labeling) fu.log("writing subresults") # write the resulting segmentation with vu.file_reader(output_path) as f_out, vu.file_reader(ws_path, 'r') as f_in: ds_in = f_in[ws_key] ds_out = f_out[output_key] with futures.ThreadPoolExecutor(n_threads) as tp: tasks = [ tp.submit(_write_block_res, ds_in, ds_out, block_id, blocking, block_res) for block_id, block_res in zip(block_list, block_results) ] [t.result() for t in tasks] fu.log_job_success(job_id)
def _solve_block_problem(block_id, graph, uv_ids, block_prefix, costs, lifted_uvs, lifted_costs, lifted_agglomerator, agglomerator, ignore_label, blocking, out, time_limit): fu.log("Start processing block %i" % block_id) # load the nodes in this sub-block and map them # to our current node-labeling block_path = block_prefix + str(block_id) assert os.path.exists(block_path), block_path nodes = ndist.loadNodes(block_path) # if we have an ignore label, remove zero from the nodes # (nodes are sorted, so it will always be at pos 0) if ignore_label and nodes[0] == 0: nodes = nodes[1:] removed_ignore_label = True if len(nodes) == 0: fu.log_block_success(block_id) return else: removed_ignore_label = False # we allow for invalid nodes here, # which can occur for un-connected graphs resulting from bad masks ... inner_edges, outer_edges = graph.extractSubgraphFromNodes( nodes, allowInvalidNodes=True) # if we only have no inner edges, return # the outer edges as cut edges if len(inner_edges) == 0: if len(nodes) > 1: assert removed_ignore_label,\ "Can only have trivial sub-graphs for more than one node if we removed ignore label" cut_edge_ids = outer_edges sub_result = None fu.log("Block %i: has no inner edges" % block_id) # otherwise solve the multicut for this block else: # find the lifted uv-ids that correspond to the inner edges inner_lifted_edges = _find_lifted_edges(lifted_uvs, nodes) fu.log( "Block %i: Solving sub-block with %i nodes, %i edges and %i lifted edges" % (block_id, len(nodes), len(inner_edges), len(inner_lifted_edges))) sub_uvs = uv_ids[inner_edges] # relabel the sub-nodes and associated uv-ids for more efficient processing nodes_relabeled, max_id, mapping = vigra.analysis.relabelConsecutive( nodes, start_label=0, keep_zeros=False) sub_uvs = nt.takeDict(mapping, sub_uvs) n_local_nodes = max_id + 1 sub_graph = nifty.graph.undirectedGraph(n_local_nodes) sub_graph.insertEdges(sub_uvs) sub_costs = costs[inner_edges] assert len(sub_costs) == sub_graph.numberOfEdges # we only need to run lifted multicut if we have lifted edges in # the subgraph if len(inner_lifted_edges) > 0: fu.log( "Block %i: have lifted edges and use lifted multicut solver") sub_lifted_uvs = nt.takeDict(mapping, lifted_uvs[inner_lifted_edges]) sub_lifted_costs = lifted_costs[inner_lifted_edges] # solve multicut and relabel the result sub_result = lifted_agglomerator(sub_graph, sub_costs, sub_lifted_uvs, sub_lifted_costs, time_limit=time_limit) # otherwise we run normal multicut else: fu.log("Block %i: don't have lifted edges and use multicut solver") # solve multicut and relabel the result sub_result = agglomerator(sub_graph, sub_costs, time_limit=time_limit) assert len(sub_result) == len(nodes), "%i, %i" % (len(sub_result), len(nodes)) sub_edgeresult = sub_result[sub_uvs[:, 0]] != sub_result[sub_uvs[:, 1]] assert len(sub_edgeresult) == len(inner_edges) cut_edge_ids = inner_edges[sub_edgeresult] cut_edge_ids = np.concatenate([cut_edge_ids, outer_edges]) _, res_max_id, _ = vigra.analysis.relabelConsecutive(sub_result, start_label=1, keep_zeros=False, out=sub_result) fu.log("Block %i: Subresult has %i unique ids" % (block_id, res_max_id)) # IMPORTANT !!! # we can only add back the ignore label after getting the edge-result !!! 
if removed_ignore_label: sub_result = np.concatenate((np.zeros(1, dtype='uint64'), sub_result)) # get chunk id of this block block = blocking.getBlock(block_id) chunk_id = tuple(beg // sh for beg, sh in zip(block.begin, blocking.blockShape)) # serialize the cut-edge-ids and the (local) node labeling ds_edge_res = out['cut_edge_ids'] fu.log("Block %i: Serializing %i cut edges" % (block_id, len(cut_edge_ids))) ds_edge_res.write_chunk(chunk_id, cut_edge_ids, True) if sub_result is not None: ds_node_res = out['node_result'] fu.log("Block %i: Serializing %i node results" % (block_id, len(sub_result))) ds_node_res.write_chunk(chunk_id, sub_result, True) fu.log_block_success(block_id)
def transformix_coordinate(job_id, config_path): fu.log("start processing job %i" % job_id) fu.log("reading config from %s" % config_path) # read the config with open(config_path) as f: config = json.load(f) input_path = config['input_path'] input_key = config['input_key'] output_path = config['output_path'] output_key = config['output_key'] transformation_file = config['transformation_file'] elastix_dir = config['elastix_directory'] tmp_folder = config['tmp_folder'] block_list = config['block_list'] block_shape = config['block_shape'] fu.log("Applying registration with:") fu.log("transformation_file: %s" % transformation_file) fu.log("elastix_directory: %s" % elastix_dir) transformix_bin = os.path.join(elastix_dir, 'bin', 'transformix') # set the ld library path lib_path = os.environ['LD_LIBRARY_PATH'] elastix_lib_path = os.path.join(elastix_dir, 'lib') os.environ['LD_LIBRARY_PATH'] = f"{lib_path}:{elastix_lib_path}" with open_file(input_path, 'r') as f_in, open_file(output_path, 'a') as f_out: ds_in = f_in[input_key] ds_out = f_out[output_key] shape = ds_out.shape blocking = nt.blocking([0, 0, 0], shape, block_shape) for block_id in block_list: fu.log("start processing block %i" % block_id) process_block(ds_in, ds_out, blocking, block_id, transformix_bin, transformation_file, tmp_folder) fu.log_block_success(block_id) fu.log_job_success(job_id)
def solve_lifted_subproblems(job_id, config_path): fu.log("start processing job %i" % job_id) fu.log("reading config from %s" % config_path) # get the config with open(config_path) as f: config = json.load(f) # input configs problem_path = config['problem_path'] scale = config['scale'] block_shape = config['block_shape'] block_list = config['block_list'] lifted_prefix = config['lifted_prefix'] agglomerator_key = config['agglomerator'] time_limit = config.get('time_limit_solver', None) n_threads = config.get('threads_per_job', 1) fu.log("reading problem from %s" % problem_path) problem = z5py.N5File(problem_path) shape = problem.attrs['shape'] # load the costs # NOTE we use different cost identifiers for multicut and lifted multicut # in order to run both in the same n5-container. # However, for scale level 0 the costs come from the CostsWorkflow and # hence the identifier is identical costs_key = 's%i/costs_lmc' % scale if scale > 0 else 's0/costs' fu.log("reading costs from path in problem: %s" % costs_key) ds = problem[costs_key] ds.n_threads = n_threads costs = ds[:] # load the graph # NOTE we use different graph identifiers for multicut and lifted multicut # in order to run both in the same n5-container. # However, for scale level 0 the graph comes from the GraphWorkflow and # hence the identifier is identical graph_key = 's%i/graph_lmc' % scale if scale > 0 else 's0/graph' fu.log("reading graph from path in problem: %s" % graph_key) graph = ndist.Graph(os.path.join(problem_path, graph_key), numberOfThreads=n_threads) uv_ids = graph.uvIds() # check if the problem has an ignore-label ignore_label = problem[graph_key].attrs['ignoreLabel'] fu.log("ignore label is %s" % ('true' if ignore_label else 'false')) fu.log("using agglomerator %s" % agglomerator_key) lifted_agglomerator = su.key_to_lifted_agglomerator(agglomerator_key) # TODO enable different multicut agglomerator agglomerator = su.key_to_agglomerator(agglomerator_key) # load the lifted edges and costs nh_key = 's%i/lifted_nh_%s' % (scale, lifted_prefix) lifted_costs_key = 's%i/lifted_costs_%s' % (scale, lifted_prefix) ds = problem[nh_key] fu.log("reading lifted uvs") ds.n_threads = n_threads lifted_uvs = ds[:] fu.log("reading lifted costs") ds = problem[lifted_costs_key] ds.n_threads = n_threads lifted_costs = ds[:] # the output group out = problem['s%i/sub_results_lmc' % scale] # NOTE we use different sub-graph identifiers for multicut and lifted multicut # in order to run both in the same n5-container. # However, for scale level 0 the sub-graphs come from the GraphWorkflow and # are hence identical sub_graph_identifier = 'sub_graphs' if scale == 0 else 'sub_graphs_lmc' block_prefix = os.path.join(problem_path, 's%i' % scale, sub_graph_identifier, 'block_') blocking = nt.blocking([0, 0, 0], shape, list(block_shape)) fu.log("start processsing %i blocks" % len(block_list)) with futures.ThreadPoolExecutor(n_threads) as tp: tasks = [ tp.submit(_solve_block_problem, block_id, graph, uv_ids, block_prefix, costs, lifted_uvs, lifted_costs, lifted_agglomerator, agglomerator, ignore_label, blocking, out, time_limit) for block_id in block_list ] [t.result() for t in tasks] fu.log_job_success(job_id)
def learn_rf(job_id, config_path):
    fu.log("start processing job %i" % job_id)
    fu.log("reading config from %s" % config_path)
    with open(config_path, 'r') as f:
        config = json.load(f)

    features_dict = config['features_dict']
    labels_dict = config['labels_dict']
    output_path = config['output_path']
    n_threads = config['threads_per_job']
    n_trees = config.get('n_trees', 100)

    features = []
    labels = []
    # TODO enable multiple feature paths
    # NOTE we assert that keys of both dicts are identical in the main class
    for key, feat_path in features_dict.items():
        label_path = labels_dict[key]
        fu.log("reading features from %s:%s, labels from %s:%s" % tuple(feat_path + label_path))
        with vu.file_reader(feat_path[0]) as f:
            ds = f[feat_path[1]]
            ds.n_threads = n_threads
            feats = ds[:]
        with vu.file_reader(label_path[0]) as f:
            ds = f[label_path[1]]
            ds.n_threads = n_threads
            label = ds[:]
        assert len(label) == len(feats)

        # check if we have an ignore label and filter the ignored examples
        ignore_mask = label != -1
        n_keep = np.sum(ignore_mask)
        if n_keep < ignore_mask.size:
            fu.log("removing %i examples due to ignore mask" % (ignore_mask.size - n_keep))
            feats = feats[ignore_mask]
            label = label[ignore_mask]

        features.append(feats)
        labels.append(label)

    features = np.concatenate(features, axis=0)
    labels = np.concatenate(labels, axis=0)
    fu.log("start learning random forest with %i examples and %i features" % features.shape)

    rf = RandomForestClassifier(n_estimators=n_trees, n_jobs=n_threads)
    rf.fit(features, labels)

    fu.log("saving random forest to %s" % output_path)
    with open(output_path, 'wb') as f:
        pickle.dump(rf, f)
    fu.log_job_success(job_id)
def _merge_nodes(problem_path, scale, blocking, block_list, nodes, uv_ids, initial_node_labeling, n_threads): # load the cut edge ids n_edges = len(uv_ids) cut_edge_ids = _load_cut_edges(problem_path, scale, blocking, block_list, n_threads) assert len(cut_edge_ids) < n_edges, "%i = %i, does not reduce problem" % ( len(cut_edge_ids), n_edges) merge_edges = np.ones(n_edges, dtype='bool') merge_edges[cut_edge_ids] = False fu.log('merging %i / %i edges' % (np.sum(merge_edges), n_edges)) # merge node pairs with ufd ufd = nufd.boost_ufd(nodes) ufd.merge(uv_ids[merge_edges]) # get the node results and label them consecutively node_labeling = ufd.find(nodes) node_labeling, max_new_id, _ = relabelConsecutive(node_labeling, start_label=0, keep_zeros=False) assert node_labeling[0] == 0 # FIXME this looks fishy, redo !!! # # make sure that zero is still mapped to zero # if node_labeling[0] != 0: # # if it isn't, swap labels accordingly # zero_label = node_labeling[0] # to_relabel = node_labeling == 0 # node_labeling[node_labeling == zero_label] = 0 # node_labeling[to_relabel] = zero_laebl n_new_nodes = max_new_id + 1 fu.log("have %i nodes in new node labeling" % n_new_nodes) # get the labeling of initial nodes if initial_node_labeling is None: # if we don't have an initial node labeling, we are in the first scale. # here, the graph nodes might not be consecutive / not start at zero. # to keep the node labeling valid, we must make the labeling consecutive by inserting zeros fu.log("don't have an initial node labeling") # check if `nodes` are consecutive and start at zero node_max_id = int(nodes.max()) if node_max_id + 1 != len(nodes): fu.log("nodes are not consecutve and/or don't start at zero") fu.log("inflating node labels accordingly") node_labeling = nt.inflateLabeling(nodes, node_labeling, node_max_id) new_initial_node_labeling = node_labeling else: fu.log( "mapping new node labeling to labeling of inital (= scale 0) nodes" ) # NOTE access like this is ok because all node labelings will be consecutive new_initial_node_labeling = node_labeling[initial_node_labeling] assert len(new_initial_node_labeling) == len(initial_node_labeling) return n_new_nodes, node_labeling, new_initial_node_labeling
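# The node merging above relies on a union-find structure (nufd.boost_ufd). A tiny
# pure-python stand-in showing the same merge / find / relabel flow on toy data
# (illustration only, this is not the nufd API):
import numpy as np
import vigra

def _toy_find(parent, x):
    while parent[x] != x:
        parent[x] = parent[parent[x]]  # path compression
        x = parent[x]
    return x

toy_nodes = np.arange(5, dtype='uint64')
parent = {int(n): int(n) for n in toy_nodes}
for u, v in [(1, 2), (3, 4)]:  # merge the node pairs of all non-cut edges
    parent[_toy_find(parent, u)] = _toy_find(parent, v)
node_labeling = np.array([_toy_find(parent, int(n)) for n in toy_nodes], dtype='uint64')
node_labeling, max_new_id, _ = vigra.analysis.relabelConsecutive(node_labeling,
                                                                 start_label=0,
                                                                 keep_zeros=False)
print(node_labeling, max_new_id)  # e.g. [0 1 1 2 2] 2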
def find_labeling(job_id, config_path): fu.log("start processing job %i" % job_id) fu.log("reading config from %s" % config_path) with open(config_path, 'r') as f: config = json.load(f) n_jobs = config['n_jobs'] tmp_folder = config['tmp_folder'] n_threads = config['threads_per_job'] assignment_path = config['assignment_path'] assignment_key = config['assignment_key'] def _read_input(job_id): return np.load( os.path.join(tmp_folder, 'find_uniques_job_%i.npy' % job_id)) fu.log("read uniques") with futures.ThreadPoolExecutor(n_threads) as tp: tasks = [tp.submit(_read_input, job_id) for job_id in range(n_jobs)] uniques = np.concatenate([t.result() for t in tasks]) fu.log("compute uniques") uniques = np.unique(uniques) if uniques[0] == 0: start_label = 0 stop_label = len(uniques) else: start_label = 1 stop_label = len(uniques) + 1 fu.log("relabel to new max-id %i" % stop_label) new_ids = np.arange(start_label, stop_label, dtype='uint64') assignments = np.concatenate([uniques[:, None], new_ids[:, None]], axis=1) fu.log("saving results to %s/%s" % (assignment_path, assignment_key)) with vu.file_reader(assignment_path) as f: chunk_size = min(int(1e6), len(assignments)) chunks = (chunk_size, 2) ds = vu.force_dataset(f, assignment_key, shape=assignments.shape, dtype='uint64', compression='gzip', chunks=chunks) ds.n_threads = n_threads ds[:] = assignments # log success fu.log_job_success(job_id)
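# find_labeling produces a two-column table that maps the unique ids occurring in the data
# to a dense range of new ids. A toy version of the table construction, plus one common way
# (np.searchsorted, an assumption about downstream use) to apply it:
import numpy as np

uniques = np.array([0, 7, 9, 42], dtype='uint64')
start_label = 0 if uniques[0] == 0 else 1
new_ids = np.arange(start_label, start_label + len(uniques), dtype='uint64')
assignments = np.concatenate([uniques[:, None], new_ids[:, None]], axis=1)
print(assignments.tolist())   # [[0, 0], [7, 1], [9, 2], [42, 3]]

toy_labels = np.array([7, 0, 42, 9, 7], dtype='uint64')
print(new_ids[np.searchsorted(uniques, toy_labels)])  # [1 0 3 2 1]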
def solve_global(job_id, config_path):
    fu.log("start processing job %i" % job_id)
    fu.log("reading config from %s" % config_path)

    # get the config
    with open(config_path) as f:
        config = json.load(f)
    # path to the reduced problem
    problem_path = config['problem_path']
    # path where the node labeling shall be written
    assignment_path = config['assignment_path']
    assignment_key = config['assignment_key']
    scale = config['scale']
    agglomerator_key = config['agglomerator']
    n_threads = config['threads_per_job']
    time_limit = config.get('time_limit_solver', None)

    fu.log("using agglomerator %s" % agglomerator_key)
    if time_limit is None:
        fu.log("agglomeration without time limit")
    else:
        fu.log("agglomeration time limit %i" % time_limit)
    agglomerator = su.key_to_agglomerator(agglomerator_key)

    with vu.file_reader(problem_path, 'r') as f:
        group = f['s%i' % scale]
        graph_group = group['graph']
        ignore_label = graph_group.attrs['ignoreLabel']

        ds = graph_group['edges']
        ds.n_threads = n_threads
        uv_ids = ds[:]
        n_edges = len(uv_ids)
        n_nodes = int(uv_ids.max() + 1)

        # we only need to load the initial node labeling if at
        # least one reduction step was performed i.e. scale > 0
        if scale > 0:
            ds = group['node_labeling']
            ds.n_threads = n_threads
            initial_node_labeling = ds[:]

        ds = group['costs']
        ds.n_threads = n_threads
        costs = ds[:]
        assert len(costs) == n_edges, "%i, %i" % (len(costs), n_edges)

    fu.log("creating graph with %i nodes and %i edges" % (n_nodes, len(uv_ids)))
    graph = nifty.graph.undirectedGraph(n_nodes)
    graph.insertEdges(uv_ids)

    fu.log("start agglomeration")
    node_labeling = agglomerator(graph, costs, n_threads=n_threads, time_limit=time_limit)
    fu.log("finished agglomeration")

    # get the labeling of initial nodes
    if scale > 0:
        initial_node_labeling = node_labeling[initial_node_labeling]
    else:
        initial_node_labeling = node_labeling
    n_nodes = len(initial_node_labeling)

    # make sure zero is mapped to 0 if we have an ignore label
    if ignore_label and initial_node_labeling[0] != 0:
        new_max_label = int(node_labeling.max() + 1)
        initial_node_labeling[initial_node_labeling == 0] = new_max_label
        initial_node_labeling[0] = 0

    # make node labeling consecutive
    vigra.analysis.relabelConsecutive(initial_node_labeling, start_label=1,
                                      keep_zeros=True, out=initial_node_labeling)

    # write node labeling
    node_shape = (n_nodes,)
    chunks = (min(n_nodes, 524288),)
    with vu.file_reader(assignment_path) as f:
        ds = f.require_dataset(assignment_key, dtype='uint64', shape=node_shape,
                               chunks=chunks, compression='gzip')
        ds.n_threads = n_threads
        ds[:] = initial_node_labeling
    fu.log('saving results to %s:%s' % (assignment_path, assignment_key))
    fu.log_job_success(job_id)
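# Several solvers above reserve id 0 for the ignore label: if node 0 did not end up with
# label 0, every other node that carries label 0 is moved to a fresh id and node 0 itself
# is reset to 0. Toy illustration of that relabeling step:
import numpy as np

node_labeling = np.array([3, 3, 0, 1, 3], dtype='uint64')  # node 0 ended up with id 3
if node_labeling[0] != 0:
    new_max_label = int(node_labeling.max() + 1)
    node_labeling[node_labeling == 0] = new_max_label
    node_labeling[0] = 0
print(node_labeling)  # [0 3 4 1 3]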