Python _build_update_gpu Examples

Programming Language: Python

Namespace/Package Name: gpu_kernels

Method/Function: _build_update_gpu

Examples at hotexamples.com: 4

Python _build_update_gpu - 4 examples found. These are the top-rated real-world Python examples of gpu_kernels._build_update_gpu extracted from open-source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

def _diffusion_child(comm, bm=None):
    """Run the random-walk segmentation on one MPI rank.

    Rank 0 splits ``bm.indices`` across all ranks, sends the input to the
    other ranks with ``sendToChild``, runs ``walk`` on its own share, sums
    the per-label walk maps of all ranks with ``comm.Reduce`` and then
    optionally smooths the result, computes an uncertainty map and saves the
    label volumes with ``save_data``. Every other rank receives its input
    from rank 0, runs ``walk`` and contributes its walk maps to the reduction.

    Args:
        comm: MPI communicator (used through ``Get_rank``, ``Get_size``,
            ``recv``, ``Recv``, ``Barrier`` and ``Reduce``).
        bm: Job description object. It is only read on rank 0; the other
            ranks get everything they need over MPI. ``bm.label.smooth`` and
            ``bm.label.uncertainty`` are switched off in place when the
            corresponding GPU step raises.

    Returns:
        None. Results are written to disk on rank 0.
    """
    rank = comm.Get_rank()
    ngpus = comm.Get_size()

    nodename = socket.gethostname()
    name = '%s %s' % (nodename, rank)
    print(name)

    if rank == 0:

        # split indices on GPUs
        indices_split = _split_indices(bm.indices, ngpus)
        print('Indices:', indices_split)

        # send data to GPUs
        for k in range(1, ngpus):
            sendToChild(comm, bm.indices, indices_split[k], k, bm.data,
                        bm.labels, bm.label.nbrw, bm.label.sorw,
                        bm.label.allaxis)

        # init cuda device
        cuda.init()
        dev = cuda.Device(rank)
        ctx = dev.make_context()

        # select the desired script
        if bm.label.allaxis:
            from pycuda_small_allx import walk
        else:
            from pycuda_small import walk

        # run random walks
        tic = time.time()
        walkmap = walk(bm.data, bm.labels, bm.indices, indices_split[0],
                       bm.label.nbrw, bm.label.sorw, name)
        tac = time.time()
        print('Walktime_%s: ' % (name) + str(int(tac - tic)) + ' ' + 'seconds')

        # gather data
        zsh_tmp = bm.argmax_z - bm.argmin_z
        ysh_tmp = bm.argmax_y - bm.argmin_y
        xsh_tmp = bm.argmax_x - bm.argmin_x
        if ngpus > 1:
            final_zero = np.empty((bm.nol, zsh_tmp, ysh_tmp, xsh_tmp),
                                  dtype=np.float32)
            # One Barrier + Reduce per label; the other ranks issue the same
            # pair once per entry of their own walkmap, so both loop counts
            # have to agree (presumably walkmap.shape[0] == bm.nol).
            for k in range(bm.nol):
                sendbuf = np.copy(walkmap[k])
                recvbuf = np.empty((zsh_tmp, ysh_tmp, xsh_tmp),
                                   dtype=np.float32)
                comm.Barrier()
                comm.Reduce([sendbuf, MPI.FLOAT], [recvbuf, MPI.FLOAT],
                            root=0,
                            op=MPI.SUM)
                final_zero[k] = recvbuf
        else:
            final_zero = walkmap

        # block and grid size: 32x32 threads per block, enough blocks to
        # cover x and y, and one grid layer per z slice
        block = (32, 32, 1)
        x_grid = (xsh_tmp // 32) + 1
        y_grid = (ysh_tmp // 32) + 1
        grid = (int(x_grid), int(y_grid), int(zsh_tmp))
        xsh_gpu = np.int32(xsh_tmp)
        ysh_gpu = np.int32(ysh_tmp)

        # smooth: build the kernels and probe whether the two work arrays
        # fit on the device; any failure disables smoothing for this run
        if bm.label.smooth:
            try:
                update_gpu = _build_update_gpu()
                curvature_gpu = _build_curvature_gpu()
                a_gpu = gpuarray.empty((zsh_tmp, ysh_tmp, xsh_tmp),
                                       dtype=np.float32)
                b_gpu = gpuarray.zeros((zsh_tmp, ysh_tmp, xsh_tmp),
                                       dtype=np.float32)
            except Exception:
                print(
                    'Warning: GPU out of memory to allocate smooth array. Process starts without smoothing.'
                )
                bm.label.smooth = 0

        if bm.label.smooth:
            final_smooth = np.copy(final_zero)
            for k in range(bm.nol):
                a_gpu = gpuarray.to_gpu(final_smooth[k])
                # bm.label.smooth doubles as the number of iterations
                for _ in range(bm.label.smooth):
                    curvature_gpu(a_gpu,
                                  b_gpu,
                                  xsh_gpu,
                                  ysh_gpu,
                                  block=block,
                                  grid=grid)
                    update_gpu(a_gpu,
                               b_gpu,
                               xsh_gpu,
                               ysh_gpu,
                               block=block,
                               grid=grid)
                final_smooth[k] = a_gpu.get()
            final_smooth = np.argmax(final_smooth, axis=0).astype(np.uint8)
            final_smooth = get_labels(final_smooth, bm.allLabels)
            final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
            final[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y,
                  bm.argmin_x:bm.argmax_x] = final_smooth
            # drop the outermost voxel layer on every side
            # (presumably padding added by the caller -- TODO confirm)
            final = final[1:-1, 1:-1, 1:-1]
            save_data(bm.path_to_smooth, final, bm.header, bm.final_image_type,
                      bm.label.compression)

        # uncertainty
        if bm.label.uncertainty:
            try:
                max_gpu = gpuarray.zeros((3, zsh_tmp, ysh_tmp, xsh_tmp),
                                         dtype=np.float32)
                a_gpu = gpuarray.zeros((zsh_tmp, ysh_tmp, xsh_tmp),
                                       dtype=np.float32)
                kernel_uncertainty = _build_kernel_uncertainty()
                kernel_max = _build_kernel_max()
                for k in range(bm.nol):
                    a_gpu = gpuarray.to_gpu(final_zero[k])
                    kernel_max(max_gpu,
                               a_gpu,
                               xsh_gpu,
                               ysh_gpu,
                               block=block,
                               grid=grid)
                # a_gpu is reused as the output buffer of the kernel
                kernel_uncertainty(max_gpu,
                                   a_gpu,
                                   xsh_gpu,
                                   ysh_gpu,
                                   block=block,
                                   grid=grid)
                uq = a_gpu.get()
                # NOTE(review): assumes the kernel output lies in [0, 1]
                uq *= 255
                uq = uq.astype(np.uint8)
                final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
                final[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y,
                      bm.argmin_x:bm.argmax_x] = uq
                final = final[1:-1, 1:-1, 1:-1]
                save_data(bm.path_to_uq, final, compress=bm.label.compression)
            except Exception:
                print(
                    'Warning: GPU out of memory to allocate uncertainty array. Process starts without uncertainty.'
                )
                bm.label.uncertainty = False

        # free device
        ctx.pop()
        del ctx

        # argmax
        final_zero = np.argmax(final_zero, axis=0).astype(np.uint8)

        # save finals
        final_zero = get_labels(final_zero, bm.allLabels)
        final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
        final[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y,
              bm.argmin_x:bm.argmax_x] = final_zero
        final = final[1:-1, 1:-1, 1:-1]
        save_data(bm.path_to_final, final, bm.header, bm.final_image_type,
                  bm.label.compression)

        # computation time; the last branch is a plain ``else`` so that
        # exactly 3600 seconds is formatted too instead of leaving
        # time_str unbound
        t = int(time.time() - bm.TIC)
        if t < 60:
            time_str = str(t) + ' sec'
        elif t < 3600:
            time_str = str(t // 60) + ' min ' + str(t % 60) + ' sec'
        else:
            time_str = str(t // 3600) + ' h ' + str(
                (t % 3600) // 60) + ' min ' + str(t % 60) + ' sec'
        print('Computation time:', time_str)

    else:

        # Receive the input from rank 0. The tag numbers must mirror the
        # sender side (sendToChild, not visible here).
        data_z, data_y, data_x, data_dtype = comm.recv(source=0, tag=0)
        data = np.empty((data_z, data_y, data_x), dtype=data_dtype)
        if data_dtype == 'uint8':
            comm.Recv([data, MPI.BYTE], source=0, tag=1)
        else:
            # NOTE(review): any non-uint8 dtype is received as MPI.FLOAT
            comm.Recv([data, MPI.FLOAT], source=0, tag=1)
        allx, nbrw, sorw = comm.recv(source=0, tag=2)
        if allx:
            # three label volumes: shapes on tags 3..5, data on tags 6..8
            labels = []
            for k in range(3):
                labels_z, labels_y, labels_x = comm.recv(source=0, tag=k + 3)
                labels_tmp = np.empty((labels_z, labels_y, labels_x),
                                      dtype=np.int32)
                comm.Recv([labels_tmp, MPI.INT], source=0, tag=k + 6)
                labels.append(labels_tmp)
        else:
            labels_z, labels_y, labels_x = comm.recv(source=0, tag=3)
            labels = np.empty((labels_z, labels_y, labels_x), dtype=np.int32)
            comm.Recv([labels, MPI.INT], source=0, tag=6)
        indices = comm.recv(source=0, tag=9)
        indices_child = comm.recv(source=0, tag=10)

        # init cuda device; wrap the rank so it always maps to a device
        # index that exists on this host
        cuda.init()
        dev = cuda.Device(rank % cuda.Device.count())
        ctx = dev.make_context()

        # select the desired script
        if allx:
            from pycuda_small_allx import walk
        else:
            from pycuda_small import walk

        # run random walks
        tic = time.time()
        walkmap = walk(data, labels, indices, indices_child, nbrw, sorw, name)
        tac = time.time()
        print('Walktime_%s: ' % (name) + str(int(tac - tic)) + ' ' + 'seconds')

        # free device
        ctx.pop()
        del ctx

        # send data: contribute one walk map per label to the sum on rank 0
        for k in range(walkmap.shape[0]):
            datatemporaer = np.copy(walkmap[k])
            comm.Barrier()
            comm.Reduce([datatemporaer, MPI.FLOAT], None, root=0, op=MPI.SUM)

Example #2

Show file

File: pycuda_large.py Project: philipp-loesel/biomedisa

def walk(comm, raw, slices, indices, nbrw, sorw, blockmin, blockmax, name,
         allLabels, smooth, uncertainty):
    """Run the GPU random walks for every label on this rank's block.

    For each entry of ``allLabels`` the hit array is reset on the device,
    the walk kernel is launched (only when ``indices`` has a truthy entry),
    the hits are exchanged with the other ranks through ``sendrecv`` and the
    label with the most hits per voxel is tracked with ``max_to_label``.
    Optional smoothing and uncertainty steps run on the same device buffer.

    All ranks agree on success or failure of each allocation step through an
    ``Allreduce`` with ``MPI.MAX``, so one failing rank disables the step
    (or aborts the walk) on every rank.

    Args:
        comm: MPI communicator.
        raw: 3-D image block. ``uint8`` input is shifted by 128 and stored
            as ``int8``; every other dtype is cast to ``float32``.
        slices: Label slices; cast to ``int32`` and passed through
            ``reduceBlocksize``.
        indices: Sequence handed to the walk kernel as ``int32`` array.
        nbrw: Kernel parameter, cast to ``int32``
            (presumably the number of random walks -- TODO confirm).
        sorw: Kernel parameter, cast to ``int32``
            (presumably the steps per random walk -- TODO confirm).
        blockmin: Lower bound on the first axis of the part of ``raw`` that
            is reported back.
        blockmax: Upper bound (exclusive) of that part.
        name: Prefix for the progress output.
        allLabels: Label values to process, in order.
        smooth: Number of smoothing iterations; falsy disables smoothing.
        uncertainty: Truthy to compute the uncertainty map.

    Returns:
        Tuple ``(memory_error, final, final_uncertainty, final_smooth)``.
        ``memory_error`` is True when any rank failed the base allocation,
        in which case the other three entries are None.
        ``final_uncertainty`` and ``final_smooth`` are None when the
        respective step is disabled.
    """

    # get rank and size of mpi process
    rank = comm.Get_rank()
    size = comm.Get_size()

    def _any_rank_failed(failed):
        """Return True if `failed` is truthy on at least one rank."""
        sendbuf = np.array([int(failed)], dtype=np.int32)
        recvbuf = np.zeros(1, dtype=np.int32)
        comm.Barrier()
        comm.Allreduce([sendbuf, MPI.INT], [recvbuf, MPI.INT], op=MPI.MAX)
        return bool(recvbuf[0] > 0)

    # build kernels
    if raw.dtype == 'uint8':
        kernel = _build_kernel_int8()
        raw = (raw - 128).astype('int8')
    else:
        kernel = _build_kernel_float32()
        raw = raw.astype(np.float32)
    fill_gpu = _build_kernel_fill()

    # image size
    zsh, ysh, xsh = raw.shape
    xsh_gpu = np.int32(xsh)
    ysh_gpu = np.int32(ysh)
    zsh_gpu = np.int32(zsh)

    # block and grid size
    block = (32, 32, 1)
    x_grid = (xsh // 32) + 1
    y_grid = (ysh // 32) + 1
    grid2 = (int(x_grid), int(y_grid), int(zsh))

    # hyper-parameter
    sorw = np.int32(sorw)
    nbrw = np.int32(nbrw)

    # crop to region of interest
    slices = slices.astype(np.int32)
    slices = reduceBlocksize(slices)

    # allocate hist arrays
    hits = np.empty(raw.shape, dtype=np.float32)
    final = np.zeros((blockmax - blockmin, ysh, xsh), dtype=np.uint8)

    memory_error = False

    # Base allocation. The status exchange is kept outside the try block so
    # that every rank takes part in the collective exactly once and an MPI
    # error is not misreported as a GPU memory problem.
    has_indices = False
    alloc_failed = False
    try:
        has_indices = bool(np.any(indices))
        if has_indices:
            slshape = slices.shape[0]
            indices = np.array(indices, dtype=np.int32)
            indices_gpu = gpuarray.to_gpu(indices)
            slices_gpu = gpuarray.to_gpu(slices)
            grid = (int(x_grid), int(y_grid), int(slshape))

        raw_gpu = gpuarray.to_gpu(raw)
        hits_gpu = cuda.mem_alloc(hits.nbytes)
    except Exception:
        print('Error: GPU out of memory. Data too large.')
        alloc_failed = True

    if _any_rank_failed(alloc_failed):
        memory_error = True
        try:
            hits_gpu.free()
        except Exception:
            # best effort: hits_gpu does not exist if this rank failed
            pass
        return memory_error, None, None, None

    if smooth:
        smooth_failed = False
        try:
            update_gpu = _build_update_gpu()
            curvature_gpu = _build_curvature_gpu()
            b_npy = np.zeros(raw.shape, dtype=np.float32)
            b_gpu = cuda.mem_alloc(b_npy.nbytes)
            cuda.memcpy_htod(b_gpu, b_npy)
            final_smooth = np.zeros((blockmax - blockmin, ysh, xsh),
                                    dtype=np.uint8)
        except Exception:
            print(
                'Warning: GPU out of memory to allocate smooth array. Process starts without smoothing.'
            )
            smooth_failed = True
        if _any_rank_failed(smooth_failed):
            smooth = 0
            try:
                b_gpu.free()
            except Exception:
                # best effort: b_gpu does not exist if this rank failed
                pass

    if uncertainty:
        uq_failed = False
        try:
            max_npy = np.zeros((3, ) + raw.shape, dtype=np.float32)
            max_gpu = cuda.mem_alloc(max_npy.nbytes)
            cuda.memcpy_htod(max_gpu, max_npy)
            kernel_uncertainty = _build_kernel_uncertainty()
            kernel_max = _build_kernel_max()
        except Exception:
            print(
                'Warning: GPU out of memory to allocate uncertainty array. Process starts without uncertainty.'
            )
            uq_failed = True
        if _any_rank_failed(uq_failed):
            uncertainty = False
            try:
                max_gpu.free()
            except Exception:
                # best effort: max_gpu does not exist if this rank failed
                pass

    for label_counter, segment in enumerate(allLabels):
        print('%s:' % (name) + ' ' + str(label_counter + 1) + '/' +
              str(len(allLabels)))

        # current segment
        segment_gpu = np.int32(segment)

        # reset array of hits
        fill_gpu(hits_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid2)

        # compute random walks
        if has_indices:
            kernel(segment_gpu,
                   raw_gpu,
                   slices_gpu,
                   hits_gpu,
                   xsh_gpu,
                   ysh_gpu,
                   zsh_gpu,
                   indices_gpu,
                   sorw,
                   nbrw,
                   block=block,
                   grid=grid)

        # get hits
        cuda.memcpy_dtoh(hits, hits_gpu)

        # communicate hits
        if size > 1:
            hits = sendrecv(hits, blockmin, blockmax, comm, rank, size)
            # the device copy is only needed again by the optional steps
            if uncertainty or smooth:
                cuda.memcpy_htod(hits_gpu, hits)

        # save the three most occurring hits
        if uncertainty:
            kernel_max(max_gpu,
                       hits_gpu,
                       xsh_gpu,
                       ysh_gpu,
                       block=block,
                       grid=grid2)

        # smooth manifold (in place on hits_gpu; runs after kernel_max, so
        # the uncertainty step sees the unsmoothed hits)
        if smooth:
            for _ in range(smooth):
                curvature_gpu(hits_gpu,
                              b_gpu,
                              xsh_gpu,
                              ysh_gpu,
                              block=block,
                              grid=grid2)
                update_gpu(hits_gpu,
                           b_gpu,
                           xsh_gpu,
                           ysh_gpu,
                           block=block,
                           grid=grid2)
            hits_smooth = np.empty_like(hits)
            cuda.memcpy_dtoh(hits_smooth, hits_gpu)
            if label_counter == 0:
                hits_smooth[hits_smooth < 0] = 0
                walkmap_smooth = np.copy(hits_smooth)
            else:
                walkmap_smooth, final_smooth = max_to_label(
                    hits_smooth, walkmap_smooth, final_smooth, blockmin,
                    blockmax, segment)

        # get the label with the most hits
        if label_counter == 0:
            walkmap = np.copy(hits)
        else:
            walkmap, final = max_to_label(hits, walkmap, final, blockmin,
                                          blockmax, segment)

    # compute uncertainty (hits_gpu is reused as the output buffer)
    if uncertainty:
        kernel_uncertainty(max_gpu,
                           hits_gpu,
                           xsh_gpu,
                           ysh_gpu,
                           block=block,
                           grid=grid2)
        final_uncertainty = np.empty_like(hits)
        cuda.memcpy_dtoh(final_uncertainty, hits_gpu)
        final_uncertainty = final_uncertainty[blockmin:blockmax]
    else:
        final_uncertainty = None

    if not smooth:
        final_smooth = None

    try:
        hits_gpu.free()
    except Exception:
        # best effort: a failed release must not discard the results
        pass

    return memory_error, final, final_uncertainty, final_smooth

Example #3

Show file

def _diffusion_child(comm, bm=None):
    """Run the random-walk segmentation on one MPI rank and publish results.

    Rank 0 splits ``bm.indices`` across all ranks, sends the input to the
    other ranks with ``sendToChild``, runs ``walk`` on its own share, sums
    the per-label walk maps of all ranks with ``comm.Reduce`` and then
    optionally smooths the result and computes an uncertainty map. It saves
    the volumes under unique file names, creates the matching ``Upload``
    records, appends a line to the time log, sends a notification and starts
    the follow-up jobs (queue jobs on linux, processes on windows). Every
    other rank receives its input from rank 0, runs ``walk`` and contributes
    its walk maps to the reduction.

    Args:
        comm: MPI communicator (used through ``Get_rank``, ``Get_size``,
            ``recv``, ``Recv``, ``Barrier`` and ``Reduce``).
        bm: Job description object. It is only read on rank 0; the other
            ranks get everything they need over MPI. ``bm.label.smooth``,
            ``bm.label.uncertainty`` and the three output paths are updated
            in place.

    Returns:
        None.
    """
    rank = comm.Get_rank()
    ngpus = comm.Get_size()

    nodename = socket.gethostname()
    name = '%s %s' %(nodename, rank)
    print(name)

    if rank == 0:

        # split indices on GPUs
        indices_split = _split_indices(bm.indices, ngpus)
        print('Indices:', indices_split)

        # send data to GPUs
        for k in range(1, ngpus):
            sendToChild(comm, bm.indices, indices_split[k], k, bm.data,
                        bm.labels, bm.label.nbrw, bm.label.sorw,
                        bm.label.allaxis)

        # init cuda device
        cuda.init()
        dev = cuda.Device(rank)
        ctx = dev.make_context()

        # select the desired script
        if bm.label.allaxis:
            from pycuda_small_allx import walk
        else:
            from pycuda_small import walk

        # run random walks
        tic = time.time()
        walkmap = walk(bm.data, bm.labels, bm.indices, indices_split[0],
                       bm.label.nbrw, bm.label.sorw, name)
        tac = time.time()
        print('Walktime_%s: ' %(name) + str(int(tac - tic)) + ' ' + 'seconds')

        # gather data
        zsh_tmp = bm.argmax_z - bm.argmin_z
        ysh_tmp = bm.argmax_y - bm.argmin_y
        xsh_tmp = bm.argmax_x - bm.argmin_x
        if ngpus > 1:
            final_zero = np.empty((bm.nol, zsh_tmp, ysh_tmp, xsh_tmp),
                                  dtype=np.float32)
            # One Barrier + Reduce per label; the other ranks issue the same
            # pair once per entry of their own walkmap, so both loop counts
            # have to agree (presumably walkmap.shape[0] == bm.nol).
            for k in range(bm.nol):
                sendbuf = np.copy(walkmap[k])
                recvbuf = np.empty((zsh_tmp, ysh_tmp, xsh_tmp),
                                   dtype=np.float32)
                comm.Barrier()
                comm.Reduce([sendbuf, MPI.FLOAT], [recvbuf, MPI.FLOAT],
                            root=0, op=MPI.SUM)
                final_zero[k] = recvbuf
        else:
            final_zero = walkmap

        # block and grid size: 32x32 threads per block, enough blocks to
        # cover x and y, and one grid layer per z slice
        block = (32, 32, 1)
        x_grid = (xsh_tmp // 32) + 1
        y_grid = (ysh_tmp // 32) + 1
        grid = (int(x_grid), int(y_grid), int(zsh_tmp))
        xsh_gpu = np.int32(xsh_tmp)
        ysh_gpu = np.int32(ysh_tmp)

        # smooth: build the kernels and probe whether the two work arrays
        # fit on the device; any failure disables smoothing for this run
        if bm.label.smooth:
            try:
                update_gpu = _build_update_gpu()
                curvature_gpu = _build_curvature_gpu()
                a_gpu = gpuarray.empty((zsh_tmp, ysh_tmp, xsh_tmp),
                                       dtype=np.float32)
                b_gpu = gpuarray.zeros((zsh_tmp, ysh_tmp, xsh_tmp),
                                       dtype=np.float32)
            except Exception:
                print('Warning: GPU out of memory to allocate smooth array. Process starts without smoothing.')
                bm.label.smooth = 0

        if bm.label.smooth:
            final_smooth = np.copy(final_zero)
            for k in range(bm.nol):
                a_gpu = gpuarray.to_gpu(final_smooth[k])
                # bm.label.smooth doubles as the number of iterations
                for _ in range(bm.label.smooth):
                    curvature_gpu(a_gpu, b_gpu, xsh_gpu, ysh_gpu,
                                  block=block, grid=grid)
                    update_gpu(a_gpu, b_gpu, xsh_gpu, ysh_gpu,
                               block=block, grid=grid)
                final_smooth[k] = a_gpu.get()
            final_smooth = np.argmax(final_smooth, axis=0).astype(np.uint8)
            final_smooth = get_labels(final_smooth, bm.allLabels)
            final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
            final[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y,
                  bm.argmin_x:bm.argmax_x] = final_smooth
            # drop the outermost voxel layer on every side
            # (presumably padding added by the caller -- TODO confirm)
            final = final[1:-1, 1:-1, 1:-1]
            bm.path_to_smooth = unique_file_path(bm.path_to_smooth,
                                                 bm.image.user.username)
            save_data(bm.path_to_smooth, final, bm.header,
                      bm.final_image_type, bm.label.compression)

        # uncertainty
        if bm.label.uncertainty:
            try:
                max_gpu = gpuarray.zeros((3, zsh_tmp, ysh_tmp, xsh_tmp),
                                         dtype=np.float32)
                a_gpu = gpuarray.zeros((zsh_tmp, ysh_tmp, xsh_tmp),
                                       dtype=np.float32)
                kernel_uncertainty = _build_kernel_uncertainty()
                kernel_max = _build_kernel_max()
                for k in range(bm.nol):
                    a_gpu = gpuarray.to_gpu(final_zero[k])
                    kernel_max(max_gpu, a_gpu, xsh_gpu, ysh_gpu,
                               block=block, grid=grid)
                # a_gpu is reused as the output buffer of the kernel
                kernel_uncertainty(max_gpu, a_gpu, xsh_gpu, ysh_gpu,
                                   block=block, grid=grid)
                uq = a_gpu.get()
                # NOTE(review): assumes the kernel output lies in [0, 1]
                uq *= 255
                uq = uq.astype(np.uint8)
                final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
                final[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y,
                      bm.argmin_x:bm.argmax_x] = uq
                final = final[1:-1, 1:-1, 1:-1]
                bm.path_to_uq = unique_file_path(bm.path_to_uq,
                                                 bm.image.user.username)
                save_data(bm.path_to_uq, final, compress=bm.label.compression)
            except Exception:
                print('Warning: GPU out of memory to allocate uncertainty array. Process starts without uncertainty.')
                bm.label.uncertainty = False

        # free device
        ctx.pop()
        del ctx

        # argmax
        final_zero = np.argmax(final_zero, axis=0).astype(np.uint8)

        # save finals
        final_zero = get_labels(final_zero, bm.allLabels)
        final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
        final[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y,
              bm.argmin_x:bm.argmax_x] = final_zero
        final = final[1:-1, 1:-1, 1:-1]
        bm.path_to_final = unique_file_path(bm.path_to_final,
                                            bm.image.user.username)
        save_data(bm.path_to_final, final, bm.header, bm.final_image_type,
                  bm.label.compression)

        # create final objects; the main result points to itself through
        # ``friend`` and the optional results point to the main result
        shortfilename = os.path.basename(bm.path_to_final)
        filename = 'images/' + bm.image.user.username + '/' + shortfilename
        tmp = Upload.objects.create(pic=filename, user=bm.image.user,
                                    project=bm.image.project, final=1,
                                    active=1, imageType=3,
                                    shortfilename=shortfilename)
        tmp.friend = tmp.id
        tmp.save()
        if bm.label.uncertainty:
            shortfilename = os.path.basename(bm.path_to_uq)
            filename = 'images/' + bm.image.user.username + '/' + shortfilename
            Upload.objects.create(pic=filename, user=bm.image.user,
                                  project=bm.image.project, final=4,
                                  imageType=3, shortfilename=shortfilename,
                                  friend=tmp.id)
        if bm.label.smooth:
            shortfilename = os.path.basename(bm.path_to_smooth)
            filename = 'images/' + bm.image.user.username + '/' + shortfilename
            smooth = Upload.objects.create(pic=filename, user=bm.image.user,
                                           project=bm.image.project, final=5,
                                           imageType=3,
                                           shortfilename=shortfilename,
                                           friend=tmp.id)

        # write in logs; the last branch is a plain ``else`` so that exactly
        # 3600 seconds is formatted too instead of leaving time_str unbound
        t = int(time.time() - bm.TIC)
        if t < 60:
            time_str = str(t) + ' sec'
        elif t < 3600:
            time_str = str(t // 60) + ' min ' + str(t % 60) + ' sec'
        else:
            time_str = (str(t // 3600) + ' h ' + str((t % 3600) // 60) +
                        ' min ' + str(t % 60) + ' sec')
        with open(bm.path_to_time, 'a') as timefile:
            print('%s %s %s %s MB %s on %s' %(time.ctime(), bm.image.user.username, bm.image.shortfilename, bm.imageSize, time_str, config['SERVER_ALIAS']), file=timefile)
        print('Total calculation time:', time_str)

        # send notification
        send_notification(bm.image.user.username, bm.image.shortfilename,
                          time_str, config['SERVER_ALIAS'])

        # start subprocesses
        if config['OS'] == 'linux':
            # acwe
            q = Queue('acwe', connection=Redis())
            q.enqueue_call(active_contour,
                           args=(bm.image.id, tmp.id, bm.label.id,),
                           timeout=-1)

            # cleanup
            q = Queue('cleanup', connection=Redis())
            q.enqueue_call(remove_outlier,
                           args=(bm.image.id, tmp.id, tmp.id, bm.label.id,),
                           timeout=-1)
            if bm.label.smooth:
                q.enqueue_call(remove_outlier,
                               args=(bm.image.id, smooth.id, tmp.id,
                                     bm.label.id, False,),
                               timeout=-1)

            # create slices
            q = Queue('slices', connection=Redis())
            q.enqueue_call(create_slices,
                           args=(bm.path_to_data, bm.path_to_final,),
                           timeout=-1)
            if bm.label.smooth:
                q.enqueue_call(create_slices,
                               args=(bm.path_to_data, bm.path_to_smooth,),
                               timeout=-1)
            if bm.label.uncertainty:
                q.enqueue_call(create_slices,
                               args=(bm.path_to_uq, None,),
                               timeout=-1)

        elif config['OS'] == 'windows':

            # acwe
            Process(target=active_contour,
                    args=(bm.image.id, tmp.id, bm.label.id)).start()

            # cleanup
            Process(target=remove_outlier,
                    args=(bm.image.id, tmp.id, tmp.id, bm.label.id)).start()
            if bm.label.smooth:
                Process(target=remove_outlier,
                        args=(bm.image.id, smooth.id, tmp.id, bm.label.id,
                              False)).start()

            # create slices
            Process(target=create_slices,
                    args=(bm.path_to_data, bm.path_to_final)).start()
            if bm.label.smooth:
                Process(target=create_slices,
                        args=(bm.path_to_data, bm.path_to_smooth)).start()
            if bm.label.uncertainty:
                Process(target=create_slices,
                        args=(bm.path_to_uq, None)).start()

    else:

        # Receive the input from rank 0. The tag numbers must mirror the
        # sender side (sendToChild, not visible here).
        data_z, data_y, data_x, data_dtype = comm.recv(source=0, tag=0)
        data = np.empty((data_z, data_y, data_x), dtype=data_dtype)
        if data_dtype == 'uint8':
            comm.Recv([data, MPI.BYTE], source=0, tag=1)
        else:
            # NOTE(review): any non-uint8 dtype is received as MPI.FLOAT
            comm.Recv([data, MPI.FLOAT], source=0, tag=1)
        allx, nbrw, sorw = comm.recv(source=0, tag=2)
        if allx:
            # three label volumes: shapes on tags 3..5, data on tags 6..8
            labels = []
            for k in range(3):
                labels_z, labels_y, labels_x = comm.recv(source=0, tag=k+3)
                labels_tmp = np.empty((labels_z, labels_y, labels_x),
                                      dtype=np.int32)
                comm.Recv([labels_tmp, MPI.INT], source=0, tag=k+6)
                labels.append(labels_tmp)
        else:
            labels_z, labels_y, labels_x = comm.recv(source=0, tag=3)
            labels = np.empty((labels_z, labels_y, labels_x), dtype=np.int32)
            comm.Recv([labels, MPI.INT], source=0, tag=6)
        indices = comm.recv(source=0, tag=9)
        indices_child = comm.recv(source=0, tag=10)

        # init cuda device; the MPI rank is global while device indices are
        # per host, so wrap the rank to stay within the local device count
        cuda.init()
        dev = cuda.Device(rank % cuda.Device.count())
        ctx = dev.make_context()

        # select the desired script
        if allx:
            from pycuda_small_allx import walk
        else:
            from pycuda_small import walk

        # run random walks
        tic = time.time()
        walkmap = walk(data, labels, indices, indices_child, nbrw, sorw, name)
        tac = time.time()
        print('Walktime_%s: ' %(name) + str(int(tac - tic)) + ' ' + 'seconds')

        # free device
        ctx.pop()
        del ctx

        # send data: contribute one walk map per label to the sum on rank 0
        for k in range(walkmap.shape[0]):
            datatemporaer = np.copy(walkmap[k])
            comm.Barrier()
            comm.Reduce([datatemporaer, MPI.FLOAT], None, root=0, op=MPI.SUM)

Example #4

Show file

def _any_rank_failed(comm, failed):
    """Agree across all MPI ranks on whether any of them hit a failure.

    This is a collective operation: every rank of ``comm`` has to call it,
    otherwise the barrier / all-reduce below blocks.

    Parameters
    ----------
    comm : MPI communicator providing ``Barrier`` and ``Allreduce``.
    failed : truthy when the calling rank failed.

    Returns
    -------
    bool
        True when at least one rank passed a truthy ``failed``.
    """
    sendbuf = np.zeros(1, dtype=np.int32)
    if failed:
        sendbuf += 1
    recvbuf = np.zeros(1, dtype=np.int32)
    comm.Barrier()
    comm.Allreduce([sendbuf, MPI.INT], [recvbuf, MPI.INT], op=MPI.MAX)
    # Index the single element explicitly instead of relying on the truth
    # value of a one-element array.
    return bool(recvbuf[0] > 0)


def _free_device_memory(allocation):
    """Best-effort release of a device allocation; ``None`` is ignored."""
    if allocation is None:
        return
    try:
        allocation.free()
    except Exception:
        # Cleanup is deliberately best-effort: a failing free() must not
        # mask the result (or the error) that is about to be reported.
        pass


def walk(comm, raw, slices, indices, nbrw, sorw, blockmin, blockmax, name,
         allLabels, smooth, uncertainty):
    """Run the GPU walk kernel for every label and build the label volume.

    For each label the kernel is launched once per axis that has
    pre-segmented slices, the resulting volume is exchanged between ranks
    (when more than one rank is used) and merged into the running result via
    ``max_to_label``. Optionally a smoothed result and an uncertainty volume
    are produced as well.

    Parameters
    ----------
    comm : MPI communicator (``Get_rank``, ``Get_size``, ``Barrier`` and
        ``Allreduce`` are used; it is also forwarded to ``sendrecv``).
    raw : 3-D array. ``uint8`` input is shifted by 128 and stored as
        ``int8``; any other dtype is converted to ``float32``.
    slices : per-axis sequence (index 0..2) of label slice stacks; only
        entries whose ``indices`` entry is truthy are read.
    indices : per-axis sequence (index 0..2) of plane positions; a falsy
        entry means that axis is not used.
    nbrw, sorw : scalars converted to ``int32`` and forwarded to the kernel
        (their meaning is defined by the kernel, which is not visible here).
    blockmin, blockmax : bounds of the plane range owned by this rank; the
        returned volumes have ``blockmax - blockmin`` planes.
    name : string used as a prefix for progress output.
    allLabels : sequence of label values to process.
    smooth : number of curvature/update iterations applied per label;
        a falsy value disables smoothing.
    uncertainty : truthy to additionally compute the uncertainty volume.

    Returns
    -------
    tuple
        ``(memory_error, final, final_uncertainty, final_smooth)``.
        ``memory_error`` is True when the mandatory GPU setup failed on any
        rank, in which case the other three entries are ``None``.
        ``final_uncertainty`` / ``final_smooth`` are ``None`` when the
        respective feature is disabled or had to be switched off because its
        buffers could not be set up on some rank.
    """
    rank = comm.Get_rank()
    size = comm.Get_size()

    if raw.dtype == 'uint8':
        kernel = _build_kernel_int8()
        raw = (raw - 128).astype('int8')
    else:
        kernel = _build_kernel_float32()
        raw = raw.astype(np.float32)

    foundAxis = [1 if indices[k] else 0 for k in range(3)]

    zsh, ysh, xsh = raw.shape
    fill_gpu = _build_kernel_fill()

    block = (32, 32, 1)
    grid2 = (int(xsh // 32 + 1), int(ysh // 32 + 1), int(zsh))

    a = np.empty(raw.shape, dtype=np.float32)
    final = np.zeros((blockmax - blockmin, ysh, xsh), dtype=np.uint8)

    # --- mandatory GPU setup -------------------------------------------
    # Smoothing buffers are intentionally NOT allocated here: they are set
    # up once in the optional section below, so a shortage that only affects
    # smoothing degrades gracefully instead of aborting the whole run, and
    # the buffer is never held twice.
    a_gpu = None
    setup_failed = False
    try:
        raw_gpu = gpuarray.to_gpu(raw)
        a_gpu = cuda.mem_alloc(a.nbytes)

        zshape = np.int32(zsh)
        yshape = np.int32(ysh)
        xshape = np.int32(xsh)
        sorw = np.int32(sorw)
        nbrw = np.int32(nbrw)

        slshape = [None] * 3
        slice_ysh = [None] * 3
        slice_xsh = [None] * 3
        indices_gpu = [None] * 3
        beta_gpu = [None] * 3
        slices_gpu = [None] * 3

        for k, found in enumerate(foundAxis):
            if not found:
                continue
            indices_tmp = np.array(indices[k], dtype=np.int32)
            slices_tmp = reduceBlocksize(slices[k].astype(np.int32))
            slshape[k], slice_ysh[k], slice_xsh[k] = slices_tmp.shape
            indices_gpu[k] = gpuarray.to_gpu(indices_tmp)
            slices_gpu[k] = gpuarray.to_gpu(slices_tmp)
            Beta = np.zeros(slices_tmp.shape, dtype=np.float32)
            for m in range(slshape[k]):
                # The image plane belonging to slice m does not depend on
                # the label, so look it up once per slice.
                plane = indices_tmp[m]
                if k == 0:
                    raw_tmp = raw[plane]
                elif k == 1:
                    raw_tmp = raw[:, plane]
                else:
                    raw_tmp = raw[:, :, plane]
                for n in allLabels:
                    A = _calc_label_walking_area(slices_tmp[m], n)
                    Beta[m] += _calc_var(raw_tmp.astype(float), A)
            beta_gpu[k] = gpuarray.to_gpu(Beta)
    except Exception:
        # Any failure during setup is reported as a memory error; catching
        # broadly keeps this rank in the collective below so the other
        # ranks do not block.
        print('Error: GPU out of memory. Data too large.')
        setup_failed = True

    if _any_rank_failed(comm, setup_failed):
        _free_device_memory(a_gpu)
        return True, None, None, None

    # --- optional smoothing buffers ------------------------------------
    b_gpu = None
    if smooth:
        smooth_failed = False
        try:
            update_gpu = _build_update_gpu()
            curvature_gpu = _build_curvature_gpu()
            b_npy = np.zeros(raw.shape, dtype=np.float32)
            b_gpu = cuda.mem_alloc(b_npy.nbytes)
            cuda.memcpy_htod(b_gpu, b_npy)
            # The host staging array is only needed to zero the device
            # buffer; drop it so it does not occupy host memory for the run.
            del b_npy
            final_smooth = np.zeros((blockmax - blockmin, yshape, xshape),
                                    dtype=np.uint8)
        except Exception:
            print(
                'Warning: GPU out of memory to allocate smooth array. Process starts without smoothing.'
            )
            smooth_failed = True
        if _any_rank_failed(comm, smooth_failed):
            # All ranks have to agree, so smoothing is disabled everywhere.
            smooth = 0
            _free_device_memory(b_gpu)
            b_gpu = None

    # --- optional uncertainty buffers ----------------------------------
    max_gpu = None
    if uncertainty:
        uq_failed = False
        try:
            max_npy = np.zeros((3, ) + raw.shape, dtype=np.float32)
            max_gpu = cuda.mem_alloc(max_npy.nbytes)
            cuda.memcpy_htod(max_gpu, max_npy)
            del max_npy
            kernel_uncertainty = _build_kernel_uncertainty()
            kernel_max = _build_kernel_max()
        except Exception:
            print(
                'Warning: GPU out of memory to allocate uncertainty array. Process starts without uncertainty.'
            )
            uq_failed = True
        if _any_rank_failed(comm, uq_failed):
            uncertainty = False
            _free_device_memory(max_gpu)
            max_gpu = None

    # --- per-label processing ------------------------------------------
    for label_counter, segment in enumerate(allLabels):
        print('%s: %d/%d' % (name, label_counter + 1, len(allLabels)))
        fill_gpu(a_gpu, xshape, yshape, block=block, grid=grid2)
        segment_gpu = np.int32(segment)
        for k, found in enumerate(foundAxis):
            if not found:
                continue
            grid = (int(slice_xsh[k] // 32 + 1),
                    int(slice_ysh[k] // 32 + 1),
                    int(slshape[k]))
            kernel(np.int32(k),
                   segment_gpu,
                   raw_gpu,
                   slices_gpu[k],
                   a_gpu,
                   xshape,
                   yshape,
                   zshape,
                   indices_gpu[k],
                   sorw,
                   beta_gpu[k],
                   nbrw,
                   block=block,
                   grid=grid)
        cuda.memcpy_dtoh(a, a_gpu)

        if size > 1:
            a = sendrecv(a, blockmin, blockmax, comm, rank, size)

        # The follow-up kernels operate on the (possibly exchanged) volume,
        # so it is uploaded again before they run.
        if smooth or uncertainty:
            cuda.memcpy_htod(a_gpu, a)

        if uncertainty:
            kernel_max(max_gpu, a_gpu, xshape, yshape, block=block, grid=grid2)

        if smooth:
            for _ in range(smooth):
                curvature_gpu(a_gpu,
                              b_gpu,
                              xshape,
                              yshape,
                              block=block,
                              grid=grid2)
                update_gpu(a_gpu,
                           b_gpu,
                           xshape,
                           yshape,
                           block=block,
                           grid=grid2)
            a_smooth = np.empty_like(a)
            cuda.memcpy_dtoh(a_smooth, a_gpu)
            if label_counter == 0:
                a_smooth[a_smooth < 0] = 0
                walkmap_smooth = np.copy(a_smooth)
            else:
                walkmap_smooth, final_smooth = max_to_label(
                    a_smooth, walkmap_smooth, final_smooth, blockmin, blockmax,
                    segment)

        if label_counter == 0:
            a[a < 0] = 0
            walkmap = np.copy(a)
        else:
            walkmap, final = max_to_label(a, walkmap, final, blockmin,
                                          blockmax, segment)

    # --- collect optional results --------------------------------------
    if uncertainty:
        kernel_uncertainty(max_gpu,
                           a_gpu,
                           xshape,
                           yshape,
                           block=block,
                           grid=grid2)
        final_uncertainty = np.empty_like(a)
        cuda.memcpy_dtoh(final_uncertainty, a_gpu)
        final_uncertainty = final_uncertainty[blockmin:blockmax]
    else:
        final_uncertainty = None

    if not smooth:
        final_smooth = None

    # Release device buffers deterministically instead of waiting for
    # garbage collection.
    _free_device_memory(a_gpu)
    _free_device_memory(b_gpu)
    _free_device_memory(max_gpu)

    return False, final, final_uncertainty, final_smooth