def _diffusion_child(comm, bm=None):
    """Run the random-walk segmentation on one MPI rank.

    Rank 0 distributes the input to the other ranks, runs its own share of
    the random walks, sums the per-label walk maps of all ranks, optionally
    smooths them and computes an uncertainty map, and saves the results.
    Every other rank receives its input from rank 0, runs its share of the
    walks and contributes its walk maps to the reduction on rank 0.

    Args:
        comm: MPI communicator. The rank selects the role; the communicator
            size is used as the number of GPUs.
        bm: Job description object. Only read on rank 0; the other ranks
            get everything they need through ``comm``.
    """
    rank = comm.Get_rank()
    ngpus = comm.Get_size()
    nodename = socket.gethostname()
    name = '%s %s' % (nodename, rank)
    print(name)

    if rank == 0:

        # split indices on GPUs
        indices_split = _split_indices(bm.indices, ngpus)
        print('Indices:', indices_split)

        # send data to GPUs
        for k in range(1, ngpus):
            sendToChild(comm, bm.indices, indices_split[k], k, bm.data, bm.labels,
                        bm.label.nbrw, bm.label.sorw, bm.label.allaxis)

        # init cuda device
        cuda.init()
        dev = cuda.Device(rank)
        ctx = dev.make_context()

        # The context must leave the stack even if a step below raises.
        try:
            # select the desired script
            if bm.label.allaxis:
                from pycuda_small_allx import walk
            else:
                from pycuda_small import walk

            # run random walks
            tic = time.time()
            walkmap = walk(bm.data, bm.labels, bm.indices, indices_split[0],
                           bm.label.nbrw, bm.label.sorw, name)
            tac = time.time()
            print('Walktime_%s: ' % (name) + str(int(tac - tic)) + ' ' + 'seconds')

            # gather data
            zsh_tmp = bm.argmax_z - bm.argmin_z
            ysh_tmp = bm.argmax_y - bm.argmin_y
            xsh_tmp = bm.argmax_x - bm.argmin_x
            if ngpus > 1:
                final_zero = np.empty((bm.nol, zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32)
                for k in range(bm.nol):
                    sendbuf = np.copy(walkmap[k])
                    recvbuf = np.empty((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32)
                    comm.Barrier()
                    comm.Reduce([sendbuf, MPI.FLOAT], [recvbuf, MPI.FLOAT], root=0, op=MPI.SUM)
                    final_zero[k] = recvbuf
            else:
                final_zero = walkmap

            # block and grid size
            block = (32, 32, 1)
            x_grid = (xsh_tmp // 32) + 1
            y_grid = (ysh_tmp // 32) + 1
            grid = (int(x_grid), int(y_grid), int(zsh_tmp))
            xsh_gpu = np.int32(xsh_tmp)
            ysh_gpu = np.int32(ysh_tmp)

            # smooth
            if bm.label.smooth:
                try:
                    update_gpu = _build_update_gpu()
                    curvature_gpu = _build_curvature_gpu()
                    # a_gpu is only allocated here to check that both work
                    # buffers fit; it is replaced per label below.
                    a_gpu = gpuarray.empty((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32)
                    b_gpu = gpuarray.zeros((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32)
                except Exception:
                    print('Warning: GPU out of memory to allocate smooth array. '
                          'Process starts without smoothing.')
                    bm.label.smooth = 0

            if bm.label.smooth:
                final_smooth = np.copy(final_zero)
                for k in range(bm.nol):
                    a_gpu = gpuarray.to_gpu(final_smooth[k])
                    # bm.label.smooth doubles as the number of iterations
                    for _ in range(bm.label.smooth):
                        curvature_gpu(a_gpu, b_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid)
                        update_gpu(a_gpu, b_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid)
                    final_smooth[k] = a_gpu.get()
                final_smooth = np.argmax(final_smooth, axis=0).astype(np.uint8)
                final_smooth = get_labels(final_smooth, bm.allLabels)
                final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
                final[bm.argmin_z:bm.argmax_z,
                      bm.argmin_y:bm.argmax_y,
                      bm.argmin_x:bm.argmax_x] = final_smooth
                # drop the outermost voxel layer on every side
                final = final[1:-1, 1:-1, 1:-1]
                save_data(bm.path_to_smooth, final, bm.header, bm.final_image_type,
                          bm.label.compression)

            # uncertainty
            if bm.label.uncertainty:
                # NOTE(review): every failure in this section, including one
                # raised by save_data, is reported as out-of-memory.
                try:
                    max_gpu = gpuarray.zeros((3, zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32)
                    a_gpu = gpuarray.zeros((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32)
                    kernel_uncertainty = _build_kernel_uncertainty()
                    kernel_max = _build_kernel_max()
                    for k in range(bm.nol):
                        a_gpu = gpuarray.to_gpu(final_zero[k])
                        kernel_max(max_gpu, a_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid)
                    # the kernel writes its result into a_gpu
                    kernel_uncertainty(max_gpu, a_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid)
                    uq = a_gpu.get()
                    uq *= 255
                    uq = uq.astype(np.uint8)
                    final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
                    final[bm.argmin_z:bm.argmax_z,
                          bm.argmin_y:bm.argmax_y,
                          bm.argmin_x:bm.argmax_x] = uq
                    final = final[1:-1, 1:-1, 1:-1]
                    save_data(bm.path_to_uq, final, compress=bm.label.compression)
                except Exception:
                    print('Warning: GPU out of memory to allocate uncertainty array. '
                          'Process starts without uncertainty.')
                    bm.label.uncertainty = False
        finally:
            # free device
            ctx.pop()
            del ctx

        # argmax
        final_zero = np.argmax(final_zero, axis=0).astype(np.uint8)

        # save finals
        final_zero = get_labels(final_zero, bm.allLabels)
        final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
        final[bm.argmin_z:bm.argmax_z,
              bm.argmin_y:bm.argmax_y,
              bm.argmin_x:bm.argmax_x] = final_zero
        final = final[1:-1, 1:-1, 1:-1]
        save_data(bm.path_to_final, final, bm.header, bm.final_image_type, bm.label.compression)

        # computation time
        t = int(time.time() - bm.TIC)
        if t < 60:
            time_str = str(t) + ' sec'
        elif t < 3600:
            time_str = str(t // 60) + ' min ' + str(t % 60) + ' sec'
        else:
            # covers t == 3600 as well, which used to match no branch
            time_str = (str(t // 3600) + ' h ' + str((t % 3600) // 60) + ' min '
                        + str(t % 60) + ' sec')
        print('Computation time:', time_str)

    else:

        # receive the image volume (shape and dtype first, then the buffer)
        data_z, data_y, data_x, data_dtype = comm.recv(source=0, tag=0)
        data = np.empty((data_z, data_y, data_x), dtype=data_dtype)
        if data_dtype == 'uint8':
            comm.Recv([data, MPI.BYTE], source=0, tag=1)
        else:
            comm.Recv([data, MPI.FLOAT], source=0, tag=1)

        # receive the walk parameters and the label volume(s)
        allx, nbrw, sorw = comm.recv(source=0, tag=2)
        if allx:
            # three label volumes: shapes on tags 3..5, buffers on tags 6..8
            labels = []
            for k in range(3):
                labels_z, labels_y, labels_x = comm.recv(source=0, tag=k + 3)
                labels_tmp = np.empty((labels_z, labels_y, labels_x), dtype=np.int32)
                comm.Recv([labels_tmp, MPI.INT], source=0, tag=k + 6)
                labels.append(labels_tmp)
        else:
            labels_z, labels_y, labels_x = comm.recv(source=0, tag=3)
            labels = np.empty((labels_z, labels_y, labels_x), dtype=np.int32)
            comm.Recv([labels, MPI.INT], source=0, tag=6)

        indices = comm.recv(source=0, tag=9)
        indices_child = comm.recv(source=0, tag=10)

        # init cuda device
        cuda.init()
        dev = cuda.Device(rank % cuda.Device.count())
        ctx = dev.make_context()

        try:
            # select the desired script
            if allx:
                from pycuda_small_allx import walk
            else:
                from pycuda_small import walk

            # run random walks
            tic = time.time()
            walkmap = walk(data, labels, indices, indices_child, nbrw, sorw, name)
            tac = time.time()
            print('Walktime_%s: ' % (name) + str(int(tac - tic)) + ' ' + 'seconds')
        finally:
            # free device
            ctx.pop()
            del ctx

        # send data: one summing reduction per label, matching rank 0
        for k in range(walkmap.shape[0]):
            datatemporaer = np.copy(walkmap[k])
            comm.Barrier()
            comm.Reduce([datatemporaer, MPI.FLOAT], None, root=0, op=MPI.SUM)
def _any_rank_failed(comm, failed):
    """Return True if ``failed`` is truthy on at least one rank of ``comm``.

    Collective: every rank of ``comm`` has to call this exactly once per
    check, regardless of its own outcome.
    """
    sendbuf = np.array([1 if failed else 0], dtype=np.int32)
    recvbuf = np.zeros(1, dtype=np.int32)
    comm.Barrier()
    comm.Allreduce([sendbuf, MPI.INT], [recvbuf, MPI.INT], op=MPI.MAX)
    return bool(recvbuf[0] > 0)


def _release_device_buffer(buf):
    """Free a device allocation if there is one; failures are ignored."""
    if buf is None:
        return
    try:
        buf.free()
    except Exception:
        # Best effort: a failed free must not mask the caller's result.
        pass


def walk(comm, raw, slices, indices, nbrw, sorw, blockmin, blockmax, name,
         allLabels, smooth, uncertainty):
    """Run the random walks for every label on this rank's GPU.

    For each label the hit counts are computed on the GPU, exchanged with
    the other ranks via ``sendrecv`` when there is more than one, and merged
    into a label image through ``max_to_label``. Smoothing and uncertainty
    are optional and are switched off on all ranks if any rank fails to
    allocate the extra device memory they need.

    Args:
        comm: MPI communicator shared by all participating ranks.
        raw: 3-D image volume. ``uint8`` input is shifted by 128 and stored
            as ``int8``; anything else is converted to ``float32``.
        slices: Pre-segmented slices; cast to ``int32`` and passed through
            ``reduceBlocksize``.
        indices: Indices belonging to ``slices``; if none are truthy this
            rank launches no walk kernel.
        nbrw: Walk parameter handed to the kernel as ``int32``
            (presumably the number of random walks; confirm with kernel).
        sorw: Walk parameter handed to the kernel as ``int32``
            (presumably the steps per walk; confirm with kernel).
        blockmin: First z-index of the range this rank reports.
        blockmax: End (exclusive) of the z-range this rank reports.
        name: Prefix for the progress output.
        allLabels: Label values to process, in order.
        smooth: Number of smoothing iterations per label; 0 disables.
        uncertainty: Whether to compute the uncertainty map.

    Returns:
        Tuple ``(memory_error, final, final_uncertainty, final_smooth)``.
        ``memory_error`` is True (and the rest None) if any rank could not
        allocate the primary buffers. ``final_uncertainty`` and
        ``final_smooth`` are None when the respective option is off.
    """
    # get rank and size of mpi process
    rank = comm.Get_rank()
    size = comm.Get_size()

    # build kernels
    if raw.dtype == 'uint8':
        kernel = _build_kernel_int8()
        raw = (raw - 128).astype('int8')
    else:
        kernel = _build_kernel_float32()
        raw = raw.astype(np.float32)
    fill_gpu = _build_kernel_fill()

    # image size
    zsh, ysh, xsh = raw.shape
    xsh_gpu = np.int32(xsh)
    ysh_gpu = np.int32(ysh)
    zsh_gpu = np.int32(zsh)

    # block and grid size
    block = (32, 32, 1)
    x_grid = (xsh // 32) + 1
    y_grid = (ysh // 32) + 1
    grid2 = (int(x_grid), int(y_grid), int(zsh))

    # hyper-parameter
    sorw = np.int32(sorw)
    nbrw = np.int32(nbrw)

    # crop to region of interest
    slices = slices.astype(np.int32)
    slices = reduceBlocksize(slices)

    # allocate hist arrays
    hits = np.empty(raw.shape, dtype=np.float32)
    final = np.zeros((blockmax - blockmin, ysh, xsh), dtype=np.uint8)

    # primary device buffers; a failure on any rank aborts the whole job
    memory_error = False
    hits_gpu = None
    allocation_failed = False
    try:
        has_indices = bool(np.any(indices))
        if has_indices:
            slshape = slices.shape[0]
            indices = np.array(indices, dtype=np.int32)
            indices_gpu = gpuarray.to_gpu(indices)
            slices_gpu = gpuarray.to_gpu(slices)
            grid = (int(x_grid), int(y_grid), int(slshape))
        raw_gpu = gpuarray.to_gpu(raw)
        hits_gpu = cuda.mem_alloc(hits.nbytes)
    except Exception:
        print('Error: GPU out of memory. Data too large.')
        allocation_failed = True

    # One collective per rank, whatever the local outcome was.
    if _any_rank_failed(comm, allocation_failed):
        memory_error = True
        _release_device_buffer(hits_gpu)
        return memory_error, None, None, None

    # optional smoothing buffers; disabled everywhere if one rank fails
    if smooth:
        b_gpu = None
        smooth_failed = False
        try:
            update_gpu = _build_update_gpu()
            curvature_gpu = _build_curvature_gpu()
            b_npy = np.zeros(raw.shape, dtype=np.float32)
            b_gpu = cuda.mem_alloc(b_npy.nbytes)
            cuda.memcpy_htod(b_gpu, b_npy)
            final_smooth = np.zeros((blockmax - blockmin, ysh, xsh), dtype=np.uint8)
        except Exception:
            print('Warning: GPU out of memory to allocate smooth array. '
                  'Process starts without smoothing.')
            smooth_failed = True
        if _any_rank_failed(comm, smooth_failed):
            smooth = 0
            _release_device_buffer(b_gpu)

    # optional uncertainty buffers; disabled everywhere if one rank fails
    if uncertainty:
        max_gpu = None
        uncertainty_failed = False
        try:
            max_npy = np.zeros((3, ) + raw.shape, dtype=np.float32)
            max_gpu = cuda.mem_alloc(max_npy.nbytes)
            cuda.memcpy_htod(max_gpu, max_npy)
            kernel_uncertainty = _build_kernel_uncertainty()
            kernel_max = _build_kernel_max()
        except Exception:
            print('Warning: GPU out of memory to allocate uncertainty array. '
                  'Process starts without uncertainty.')
            uncertainty_failed = True
        if _any_rank_failed(comm, uncertainty_failed):
            uncertainty = False
            _release_device_buffer(max_gpu)

    for label_counter, segment in enumerate(allLabels):
        print('%s:' % (name) + ' ' + str(label_counter + 1) + '/' + str(len(allLabels)))

        # current segment
        segment_gpu = np.int32(segment)

        # reset array of hits
        fill_gpu(hits_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid2)

        # compute random walks
        if has_indices:
            kernel(segment_gpu, raw_gpu, slices_gpu, hits_gpu, xsh_gpu, ysh_gpu, zsh_gpu,
                   indices_gpu, sorw, nbrw, block=block, grid=grid)

        # get hits
        cuda.memcpy_dtoh(hits, hits_gpu)

        # communicate hits
        if size > 1:
            hits = sendrecv(hits, blockmin, blockmax, comm, rank, size)

        # the device copy has to reflect the exchanged hits before it is
        # used for smoothing or uncertainty
        if uncertainty or smooth:
            cuda.memcpy_htod(hits_gpu, hits)

        # save the three most occurring hits
        if uncertainty:
            kernel_max(max_gpu, hits_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid2)

        # smooth manifold
        if smooth:
            for _ in range(smooth):
                curvature_gpu(hits_gpu, b_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid2)
                update_gpu(hits_gpu, b_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid2)
            hits_smooth = np.empty_like(hits)
            cuda.memcpy_dtoh(hits_smooth, hits_gpu)
            if label_counter == 0:
                hits_smooth[hits_smooth < 0] = 0
                walkmap_smooth = np.copy(hits_smooth)
            else:
                walkmap_smooth, final_smooth = max_to_label(
                    hits_smooth, walkmap_smooth, final_smooth, blockmin, blockmax, segment)

        # get the label with the most hits
        # NOTE(review): max_to_label presumably keeps, per voxel of the
        # block, the larger hit count and its label - confirm in helper.
        if label_counter == 0:
            walkmap = np.copy(hits)
        else:
            walkmap, final = max_to_label(hits, walkmap, final, blockmin, blockmax, segment)

    # compute uncertainty
    if uncertainty:
        # the kernel writes its result into hits_gpu
        kernel_uncertainty(max_gpu, hits_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid2)
        final_uncertainty = np.empty_like(hits)
        cuda.memcpy_dtoh(final_uncertainty, hits_gpu)
        final_uncertainty = final_uncertainty[blockmin:blockmax]
    else:
        final_uncertainty = None

    if not smooth:
        final_smooth = None

    _release_device_buffer(hits_gpu)

    return memory_error, final, final_uncertainty, final_smooth
def _diffusion_child(comm, bm=None):
    """Run the random-walk segmentation on one MPI rank and publish results.

    Rank 0 distributes the input to the other ranks, runs its own share of
    the random walks, sums the per-label walk maps of all ranks, optionally
    smooths them and computes an uncertainty map, saves the results under
    unique file names, registers them as ``Upload`` objects, appends a line
    to the time log, sends a notification and starts the post-processing
    jobs. Every other rank receives its input from rank 0, runs its share
    of the walks and contributes its walk maps to the reduction on rank 0.

    Args:
        comm: MPI communicator. The rank selects the role; the communicator
            size is used as the number of GPUs.
        bm: Job description object. Only read on rank 0; the other ranks
            get everything they need through ``comm``.
    """
    rank = comm.Get_rank()
    ngpus = comm.Get_size()
    nodename = socket.gethostname()
    name = '%s %s' % (nodename, rank)
    print(name)

    if rank == 0:

        # split indices on GPUs
        indices_split = _split_indices(bm.indices, ngpus)
        print('Indices:', indices_split)

        # send data to GPUs
        for k in range(1, ngpus):
            sendToChild(comm, bm.indices, indices_split[k], k, bm.data, bm.labels,
                        bm.label.nbrw, bm.label.sorw, bm.label.allaxis)

        # init cuda device
        cuda.init()
        dev = cuda.Device(rank)
        ctx = dev.make_context()

        # The context must leave the stack even if a step below raises.
        try:
            # select the desired script
            if bm.label.allaxis:
                from pycuda_small_allx import walk
            else:
                from pycuda_small import walk

            # run random walks
            tic = time.time()
            walkmap = walk(bm.data, bm.labels, bm.indices, indices_split[0],
                           bm.label.nbrw, bm.label.sorw, name)
            tac = time.time()
            print('Walktime_%s: ' % (name) + str(int(tac - tic)) + ' ' + 'seconds')

            # gather data
            zsh_tmp = bm.argmax_z - bm.argmin_z
            ysh_tmp = bm.argmax_y - bm.argmin_y
            xsh_tmp = bm.argmax_x - bm.argmin_x
            if ngpus > 1:
                final_zero = np.empty((bm.nol, zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32)
                for k in range(bm.nol):
                    sendbuf = np.copy(walkmap[k])
                    recvbuf = np.empty((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32)
                    comm.Barrier()
                    comm.Reduce([sendbuf, MPI.FLOAT], [recvbuf, MPI.FLOAT], root=0, op=MPI.SUM)
                    final_zero[k] = recvbuf
            else:
                final_zero = walkmap

            # block and grid size
            block = (32, 32, 1)
            x_grid = (xsh_tmp // 32) + 1
            y_grid = (ysh_tmp // 32) + 1
            grid = (int(x_grid), int(y_grid), int(zsh_tmp))
            xsh_gpu = np.int32(xsh_tmp)
            ysh_gpu = np.int32(ysh_tmp)

            # smooth
            if bm.label.smooth:
                try:
                    update_gpu = _build_update_gpu()
                    curvature_gpu = _build_curvature_gpu()
                    # a_gpu is only allocated here to check that both work
                    # buffers fit; it is replaced per label below.
                    a_gpu = gpuarray.empty((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32)
                    b_gpu = gpuarray.zeros((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32)
                except Exception:
                    print('Warning: GPU out of memory to allocate smooth array. '
                          'Process starts without smoothing.')
                    bm.label.smooth = 0

            if bm.label.smooth:
                final_smooth = np.copy(final_zero)
                for k in range(bm.nol):
                    a_gpu = gpuarray.to_gpu(final_smooth[k])
                    # bm.label.smooth doubles as the number of iterations
                    for _ in range(bm.label.smooth):
                        curvature_gpu(a_gpu, b_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid)
                        update_gpu(a_gpu, b_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid)
                    final_smooth[k] = a_gpu.get()
                final_smooth = np.argmax(final_smooth, axis=0).astype(np.uint8)
                final_smooth = get_labels(final_smooth, bm.allLabels)
                final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
                final[bm.argmin_z:bm.argmax_z,
                      bm.argmin_y:bm.argmax_y,
                      bm.argmin_x:bm.argmax_x] = final_smooth
                # drop the outermost voxel layer on every side
                final = final[1:-1, 1:-1, 1:-1]
                bm.path_to_smooth = unique_file_path(bm.path_to_smooth, bm.image.user.username)
                save_data(bm.path_to_smooth, final, bm.header, bm.final_image_type,
                          bm.label.compression)

            # uncertainty
            if bm.label.uncertainty:
                # NOTE(review): every failure in this section, including one
                # raised by save_data, is reported as out-of-memory.
                try:
                    max_gpu = gpuarray.zeros((3, zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32)
                    a_gpu = gpuarray.zeros((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32)
                    kernel_uncertainty = _build_kernel_uncertainty()
                    kernel_max = _build_kernel_max()
                    for k in range(bm.nol):
                        a_gpu = gpuarray.to_gpu(final_zero[k])
                        kernel_max(max_gpu, a_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid)
                    # the kernel writes its result into a_gpu
                    kernel_uncertainty(max_gpu, a_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid)
                    uq = a_gpu.get()
                    uq *= 255
                    uq = uq.astype(np.uint8)
                    final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
                    final[bm.argmin_z:bm.argmax_z,
                          bm.argmin_y:bm.argmax_y,
                          bm.argmin_x:bm.argmax_x] = uq
                    final = final[1:-1, 1:-1, 1:-1]
                    bm.path_to_uq = unique_file_path(bm.path_to_uq, bm.image.user.username)
                    save_data(bm.path_to_uq, final, compress=bm.label.compression)
                except Exception:
                    print('Warning: GPU out of memory to allocate uncertainty array. '
                          'Process starts without uncertainty.')
                    bm.label.uncertainty = False
        finally:
            # free device
            ctx.pop()
            del ctx

        # argmax
        final_zero = np.argmax(final_zero, axis=0).astype(np.uint8)

        # save finals
        final_zero = get_labels(final_zero, bm.allLabels)
        final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
        final[bm.argmin_z:bm.argmax_z,
              bm.argmin_y:bm.argmax_y,
              bm.argmin_x:bm.argmax_x] = final_zero
        final = final[1:-1, 1:-1, 1:-1]
        bm.path_to_final = unique_file_path(bm.path_to_final, bm.image.user.username)
        save_data(bm.path_to_final, final, bm.header, bm.final_image_type, bm.label.compression)

        # create final objects
        shortfilename = os.path.basename(bm.path_to_final)
        filename = 'images/' + bm.image.user.username + '/' + shortfilename
        tmp = Upload.objects.create(pic=filename, user=bm.image.user, project=bm.image.project,
                                    final=1, active=1, imageType=3,
                                    shortfilename=shortfilename)
        # the main result is its own "friend"; the others point to it
        tmp.friend = tmp.id
        tmp.save()
        if bm.label.uncertainty:
            shortfilename = os.path.basename(bm.path_to_uq)
            filename = 'images/' + bm.image.user.username + '/' + shortfilename
            Upload.objects.create(pic=filename, user=bm.image.user, project=bm.image.project,
                                  final=4, imageType=3, shortfilename=shortfilename,
                                  friend=tmp.id)
        if bm.label.smooth:
            shortfilename = os.path.basename(bm.path_to_smooth)
            filename = 'images/' + bm.image.user.username + '/' + shortfilename
            smooth = Upload.objects.create(pic=filename, user=bm.image.user,
                                           project=bm.image.project, final=5, imageType=3,
                                           shortfilename=shortfilename, friend=tmp.id)

        # write in logs
        t = int(time.time() - bm.TIC)
        if t < 60:
            time_str = str(t) + ' sec'
        elif t < 3600:
            time_str = str(t // 60) + ' min ' + str(t % 60) + ' sec'
        else:
            # covers t == 3600 as well, which used to match no branch
            time_str = (str(t // 3600) + ' h ' + str((t % 3600) // 60) + ' min '
                        + str(t % 60) + ' sec')
        with open(bm.path_to_time, 'a') as timefile:
            print('%s %s %s %s MB %s on %s' % (time.ctime(), bm.image.user.username,
                                               bm.image.shortfilename, bm.imageSize,
                                               time_str, config['SERVER_ALIAS']),
                  file=timefile)
        print('Total calculation time:', time_str)

        # send notification
        send_notification(bm.image.user.username, bm.image.shortfilename, time_str,
                          config['SERVER_ALIAS'])

        # start subprocesses
        if config['OS'] == 'linux':
            # acwe
            q = Queue('acwe', connection=Redis())
            q.enqueue_call(active_contour, args=(bm.image.id, tmp.id, bm.label.id,),
                           timeout=-1)
            # cleanup
            q = Queue('cleanup', connection=Redis())
            q.enqueue_call(remove_outlier, args=(bm.image.id, tmp.id, tmp.id, bm.label.id,),
                           timeout=-1)
            if bm.label.smooth:
                q.enqueue_call(remove_outlier,
                               args=(bm.image.id, smooth.id, tmp.id, bm.label.id, False,),
                               timeout=-1)
            # create slices
            q = Queue('slices', connection=Redis())
            q.enqueue_call(create_slices, args=(bm.path_to_data, bm.path_to_final,),
                           timeout=-1)
            if bm.label.smooth:
                q.enqueue_call(create_slices, args=(bm.path_to_data, bm.path_to_smooth,),
                               timeout=-1)
            if bm.label.uncertainty:
                q.enqueue_call(create_slices, args=(bm.path_to_uq, None,), timeout=-1)

        elif config['OS'] == 'windows':
            # acwe
            Process(target=active_contour,
                    args=(bm.image.id, tmp.id, bm.label.id)).start()
            # cleanup
            Process(target=remove_outlier,
                    args=(bm.image.id, tmp.id, tmp.id, bm.label.id)).start()
            if bm.label.smooth:
                Process(target=remove_outlier,
                        args=(bm.image.id, smooth.id, tmp.id, bm.label.id, False)).start()
            # create slices
            Process(target=create_slices,
                    args=(bm.path_to_data, bm.path_to_final)).start()
            if bm.label.smooth:
                Process(target=create_slices,
                        args=(bm.path_to_data, bm.path_to_smooth)).start()
            if bm.label.uncertainty:
                Process(target=create_slices, args=(bm.path_to_uq, None)).start()

    else:

        # receive the image volume (shape and dtype first, then the buffer)
        data_z, data_y, data_x, data_dtype = comm.recv(source=0, tag=0)
        data = np.empty((data_z, data_y, data_x), dtype=data_dtype)
        if data_dtype == 'uint8':
            comm.Recv([data, MPI.BYTE], source=0, tag=1)
        else:
            comm.Recv([data, MPI.FLOAT], source=0, tag=1)

        # receive the walk parameters and the label volume(s)
        allx, nbrw, sorw = comm.recv(source=0, tag=2)
        if allx:
            # three label volumes: shapes on tags 3..5, buffers on tags 6..8
            labels = []
            for k in range(3):
                labels_z, labels_y, labels_x = comm.recv(source=0, tag=k + 3)
                labels_tmp = np.empty((labels_z, labels_y, labels_x), dtype=np.int32)
                comm.Recv([labels_tmp, MPI.INT], source=0, tag=k + 6)
                labels.append(labels_tmp)
        else:
            labels_z, labels_y, labels_x = comm.recv(source=0, tag=3)
            labels = np.empty((labels_z, labels_y, labels_x), dtype=np.int32)
            comm.Recv([labels, MPI.INT], source=0, tag=6)

        indices = comm.recv(source=0, tag=9)
        indices_child = comm.recv(source=0, tag=10)

        # init cuda device; wrap around so that ranks beyond the local
        # device count still get a valid device
        cuda.init()
        dev = cuda.Device(rank % cuda.Device.count())
        ctx = dev.make_context()

        try:
            # select the desired script
            if allx:
                from pycuda_small_allx import walk
            else:
                from pycuda_small import walk

            # run random walks
            tic = time.time()
            walkmap = walk(data, labels, indices, indices_child, nbrw, sorw, name)
            tac = time.time()
            print('Walktime_%s: ' % (name) + str(int(tac - tic)) + ' ' + 'seconds')
        finally:
            # free device
            ctx.pop()
            del ctx

        # send data: one summing reduction per label, matching rank 0
        for k in range(walkmap.shape[0]):
            datatemporaer = np.copy(walkmap[k])
            comm.Barrier()
            comm.Reduce([datatemporaer, MPI.FLOAT], None, root=0, op=MPI.SUM)
def _any_rank_failed(comm, failed):
    """Return True if ``failed`` is truthy on at least one rank of ``comm``.

    Collective: every rank of ``comm`` has to call this exactly once per
    check, regardless of its own outcome.
    """
    sendbuf = np.array([1 if failed else 0], dtype=np.int32)
    recvbuf = np.zeros(1, dtype=np.int32)
    comm.Barrier()
    comm.Allreduce([sendbuf, MPI.INT], [recvbuf, MPI.INT], op=MPI.MAX)
    return bool(recvbuf[0] > 0)


def _release_device_buffer(buf):
    """Free a device allocation if there is one; failures are ignored."""
    if buf is None:
        return
    try:
        buf.free()
    except Exception:
        # Best effort: a failed free must not mask the caller's result.
        pass


def walk(comm, raw, slices, indices, nbrw, sorw, blockmin, blockmax, name,
         allLabels, smooth, uncertainty):
    """Run the random walks along up to three axes for every label.

    For each axis that has indices, the labelled slices are uploaded
    together with a per-slice weight array built from ``_calc_var``. For
    each label the walk kernel is launched once per such axis into a shared
    hit buffer; the hits are exchanged with the other ranks via
    ``sendrecv`` when there is more than one, and merged into a label image
    through ``max_to_label``. Smoothing and uncertainty are optional and are
    switched off on all ranks if any rank fails to allocate the extra
    device memory they need.

    Args:
        comm: MPI communicator shared by all participating ranks.
        raw: 3-D image volume. ``uint8`` input is shifted by 128 and stored
            as ``int8``; anything else is converted to ``float32``.
        slices: Sequence of three label-slice arrays, one per axis; only
            entries whose ``indices`` entry is truthy are used.
        indices: Sequence of three index collections, one per axis.
        nbrw: Walk parameter handed to the kernel as ``int32``
            (presumably the number of random walks; confirm with kernel).
        sorw: Walk parameter handed to the kernel as ``int32``
            (presumably the steps per walk; confirm with kernel).
        blockmin: First z-index of the range this rank reports.
        blockmax: End (exclusive) of the z-range this rank reports.
        name: Prefix for the progress output.
        allLabels: Label values to process, in order.
        smooth: Number of smoothing iterations per label; 0 disables.
        uncertainty: Whether to compute the uncertainty map.

    Returns:
        Tuple ``(memory_error, final, final_uncertainty, final_smooth)``.
        ``memory_error`` is True (and the rest None) if any rank failed
        while setting up the primary buffers. ``final_uncertainty`` and
        ``final_smooth`` are None when the respective option is off.
    """
    # get rank and size of mpi process
    rank = comm.Get_rank()
    size = comm.Get_size()

    # build kernels
    if raw.dtype == 'uint8':
        kernel = _build_kernel_int8()
        raw = (raw - 128).astype('int8')
    else:
        kernel = _build_kernel_float32()
        raw = raw.astype(np.float32)
    fill_gpu = _build_kernel_fill()

    # axes that actually carry labelled slices
    found_axis = [bool(indices[k]) for k in range(3)]

    # image size
    zsh, ysh, xsh = raw.shape
    zshape = np.int32(zsh)
    yshape = np.int32(ysh)
    xshape = np.int32(xsh)

    # block and grid size for whole-volume kernels
    block = (32, 32, 1)
    grid2 = (int((xsh // 32) + 1), int((ysh // 32) + 1), int(zsh))

    # hyper-parameter
    sorw = np.int32(sorw)
    nbrw = np.int32(nbrw)

    # host buffers
    a = np.empty(raw.shape, dtype=np.float32)
    final = np.zeros((blockmax - blockmin, ysh, xsh), dtype=np.uint8)

    # per-axis device data and launch grids
    indices_gpu = [None] * 3
    slices_gpu = [None] * 3
    beta_gpu = [None] * 3
    axis_grid = [None] * 3

    # primary device buffers; a failure on any rank aborts the whole job.
    # Smoothing buffers are deliberately not allocated here: they have
    # their own, non-fatal, allocation step below.
    memory_error = False
    a_gpu = None
    allocation_failed = False
    try:
        raw_gpu = gpuarray.to_gpu(raw)
        a_gpu = cuda.mem_alloc(a.nbytes)
        for k, found in enumerate(found_axis):
            if not found:
                continue
            indices_tmp = np.array(indices[k], dtype=np.int32)
            slices_tmp = reduceBlocksize(slices[k].astype(np.int32))
            slshape, slice_ysh, slice_xsh = slices_tmp.shape
            axis_grid[k] = (int((slice_xsh // 32) + 1), int((slice_ysh // 32) + 1),
                            int(slshape))
            indices_gpu[k] = gpuarray.to_gpu(indices_tmp)
            slices_gpu[k] = gpuarray.to_gpu(slices_tmp)

            # accumulate, per labelled slice, the _calc_var result of the
            # matching image plane over all labels
            beta = np.zeros(slices_tmp.shape, dtype=np.float32)
            for m in range(slshape):
                plane = indices_tmp[m]
                if k == 0:
                    raw_plane = raw[plane]
                elif k == 1:
                    raw_plane = raw[:, plane]
                else:
                    raw_plane = raw[:, :, plane]
                for n in allLabels:
                    area = _calc_label_walking_area(slices_tmp[m], n)
                    # fresh float copy per call, in case the helper
                    # modifies its input
                    beta[m] += _calc_var(raw_plane.astype(float), area)
            beta_gpu[k] = gpuarray.to_gpu(beta)
    except Exception:
        print('Error: GPU out of memory. Data too large.')
        allocation_failed = True

    # One collective per rank, whatever the local outcome was.
    if _any_rank_failed(comm, allocation_failed):
        memory_error = True
        _release_device_buffer(a_gpu)
        return memory_error, None, None, None

    # optional smoothing buffers; disabled everywhere if one rank fails
    if smooth:
        b_gpu = None
        smooth_failed = False
        try:
            update_gpu = _build_update_gpu()
            curvature_gpu = _build_curvature_gpu()
            b_npy = np.zeros(raw.shape, dtype=np.float32)
            b_gpu = cuda.mem_alloc(b_npy.nbytes)
            cuda.memcpy_htod(b_gpu, b_npy)
            final_smooth = np.zeros((blockmax - blockmin, ysh, xsh), dtype=np.uint8)
        except Exception:
            print('Warning: GPU out of memory to allocate smooth array. '
                  'Process starts without smoothing.')
            smooth_failed = True
        if _any_rank_failed(comm, smooth_failed):
            smooth = 0
            _release_device_buffer(b_gpu)

    # optional uncertainty buffers; disabled everywhere if one rank fails
    if uncertainty:
        max_gpu = None
        uncertainty_failed = False
        try:
            max_npy = np.zeros((3, ) + raw.shape, dtype=np.float32)
            max_gpu = cuda.mem_alloc(max_npy.nbytes)
            cuda.memcpy_htod(max_gpu, max_npy)
            kernel_uncertainty = _build_kernel_uncertainty()
            kernel_max = _build_kernel_max()
        except Exception:
            print('Warning: GPU out of memory to allocate uncertainty array. '
                  'Process starts without uncertainty.')
            uncertainty_failed = True
        if _any_rank_failed(comm, uncertainty_failed):
            uncertainty = False
            _release_device_buffer(max_gpu)

    for label_counter, segment in enumerate(allLabels):
        print('%s:' % (name) + ' ' + str(label_counter + 1) + '/' + str(len(allLabels)))

        # reset array of hits
        fill_gpu(a_gpu, xshape, yshape, block=block, grid=grid2)

        # current segment
        segment_gpu = np.int32(segment)

        # compute random walks along every axis that has labelled slices
        for k, found in enumerate(found_axis):
            if found:
                axis_gpu = np.int32(k)
                kernel(axis_gpu, segment_gpu, raw_gpu, slices_gpu[k], a_gpu,
                       xshape, yshape, zshape, indices_gpu[k], sorw, beta_gpu[k], nbrw,
                       block=block, grid=axis_grid[k])

        # get hits
        cuda.memcpy_dtoh(a, a_gpu)

        # communicate hits
        if size > 1:
            a = sendrecv(a, blockmin, blockmax, comm, rank, size)

        # the device copy has to reflect the exchanged hits before it is
        # used for smoothing or uncertainty
        if smooth or uncertainty:
            cuda.memcpy_htod(a_gpu, a)

        # save the three most occurring hits
        if uncertainty:
            kernel_max(max_gpu, a_gpu, xshape, yshape, block=block, grid=grid2)

        # smooth manifold
        if smooth:
            for _ in range(smooth):
                curvature_gpu(a_gpu, b_gpu, xshape, yshape, block=block, grid=grid2)
                update_gpu(a_gpu, b_gpu, xshape, yshape, block=block, grid=grid2)
            a_smooth = np.empty_like(a)
            cuda.memcpy_dtoh(a_smooth, a_gpu)
            if label_counter == 0:
                a_smooth[a_smooth < 0] = 0
                walkmap_smooth = np.copy(a_smooth)
            else:
                walkmap_smooth, final_smooth = max_to_label(
                    a_smooth, walkmap_smooth, final_smooth, blockmin, blockmax, segment)

        # get the label with the most hits
        if label_counter == 0:
            a[a < 0] = 0
            walkmap = np.copy(a)
        else:
            walkmap, final = max_to_label(a, walkmap, final, blockmin, blockmax, segment)

    # compute uncertainty
    if uncertainty:
        # the kernel writes its result into a_gpu
        kernel_uncertainty(max_gpu, a_gpu, xshape, yshape, block=block, grid=grid2)
        final_uncertainty = np.empty_like(a)
        cuda.memcpy_dtoh(final_uncertainty, a_gpu)
        final_uncertainty = final_uncertainty[blockmin:blockmax]
    else:
        final_uncertainty = None

    if not smooth:
        final_smooth = None

    _release_device_buffer(a_gpu)

    return memory_error, final, final_uncertainty, final_smooth