Example #1
    def init_output(self):
        out_shape = self.get_output_shape()
        rows = int(np.prod(out_shape[:3]))
        cols = out_shape[3]
        # util.log('Layer %s allocating: (%s,%s)', self.name, rows, cols)
        self.output = gpuarray.GPUArray((rows, cols), dtype=np.float32)
        self.output_grad = gpuarray.GPUArray((rows, cols), dtype=np.float32)
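For context, the allocation above flattens the layer's 4-D output shape into a 2-D buffer: the first three axes become the rows, the last axis the columns. A minimal standalone sketch of the same pattern, with made-up shape values:

import numpy as np
import pycuda.autoinit  # initializes a CUDA context
import pycuda.gpuarray as gpuarray

out_shape = (6, 6, 64, 128)          # hypothetical (h, w, channels, batch)
rows = int(np.prod(out_shape[:3]))   # 6 * 6 * 64 = 2304
cols = out_shape[3]                  # 128
output = gpuarray.GPUArray((rows, cols), dtype=np.float32)
print(output.shape)                  # (2304, 128)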
Example #2
    def _gpu_init(self, debug):
        self.dev_n = cuda.to_gpu_async(np.array([self.n]).astype(np.int32),
                                       stream=self.stream2)

        self.neighbors_index = cuda.to_gpu_async(self.num_neighbors,
                                                 stream=self.stream1)
        exclusiveScan(self.neighbors_index, stream=self.stream1)

        self.dev_num_neighbors = cuda.to_gpu_async(self.num_neighbors,
                                                   stream=self.stream2)

        self.dev_states = cuda.to_gpu_async(self.states, stream=self.stream2)
        self.dev_waypoints = cuda.to_gpu_async(self.waypoints,
                                               stream=self.stream2)

        self.dev_neighbors = cuda.to_gpu_async(self.neighbors,
                                               stream=self.stream2)

        self.dev_Gindicator = cuda.GPUArray(self.Vopen.shape, self.Vopen.dtype)
        self.dev_xindicator = cuda.GPUArray(self.Vopen.shape, self.Vopen.dtype)

        self.dev_xindicator_zeros = cuda.GPUArray(self.Vopen.shape,
                                                  self.Vopen.dtype)

        self.zero_val = np.zeros((), np.int32)

        self.dev_xindicator_zeros.fill(self.zero_val, stream=self.stream1)

        # self.stream1.synchronize()
        self.stream2.synchronize()
Example #3
def test_exec_raises_on_dtype():
    dtype = np.float32
    complex_dtype = np.complex64

    M = 4096
    tol = 1e-3
    shape = (16, 16, 16)
    dim = len(shape)

    kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype)
    c = utils.gen_nonuniform_data(M).astype(complex_dtype)
    c_gpu = gpuarray.to_gpu(c)
    # Using c.real gives us wrong dtype here...
    c_gpu_wrong_dtype = gpuarray.to_gpu(c.real)

    kxyz_gpu = gpuarray.to_gpu(kxyz)
    fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype)
    # Here we'll intentionally construct an incorrect array dtype.
    fk_gpu_wrong_dtype = gpuarray.GPUArray(shape, dtype=np.complex128)

    plan = cufinufft(1, shape, eps=tol, dtype=dtype)

    plan.set_pts(kxyz_gpu[0], kxyz_gpu[1], kxyz_gpu[2])

    with pytest.raises(TypeError):
        plan.execute(c_gpu, fk_gpu_wrong_dtype)

    with pytest.raises(TypeError):
        plan.execute(c_gpu_wrong_dtype, fk_gpu)
Example #4
def fun_load(config, sock_data=5000):

    send_queue = config['queue_l2t']
    recv_queue = config['queue_t2l']
    # recv_queue and send_queue are multiprocessing.Queue
    # recv_queue is only for receiving
    # send_queue is only for sending

    # if need to do random crop and mirror
    flag_randproc = not config['use_data_layer']
    flag_batch = config['batch_crop_mirror']

    drv.init()
    dev = drv.Device(int(config['gpu'][-1]))
    ctx = dev.make_context()
    sock = zmq.Context().socket(zmq.PAIR)
    sock.bind('tcp://*:{0}'.format(sock_data))

    shape, dtype, h = sock.recv_pyobj()
    print('shared_x information received')

    gpu_data_remote = gpuarray.GPUArray(shape,
                                        dtype,
                                        gpudata=drv.IPCMemoryHandle(h))
    gpu_data = gpuarray.GPUArray(shape, dtype)

    img_mean = recv_queue.get()
    print('img_mean received')

    # The first time, do the set ups and other stuff

    # receive information for loading

    while True:
        # getting the hkl file name to load
        hkl_name = recv_queue.get()

        # print hkl_name
        data = hkl.load(hkl_name) - img_mean
        # print 'load ', time.time() - bgn_time

        if flag_randproc:
            param_rand = recv_queue.get()

            data = crop_and_mirror(data, param_rand, flag_batch=flag_batch)

        gpu_data.set(data)

        # wait for computation on last minibatch to finish
        msg = recv_queue.get()
        assert msg == 'calc_finished'

        drv.memcpy_peer(gpu_data_remote.ptr, gpu_data.ptr,
                        gpu_data.dtype.itemsize * gpu_data.size, ctx, ctx)

        ctx.synchronize()

        send_queue.put('copy_finished')
Example #5
def fun_load(config, sock_data_2=5001):
    send_queue = config['queue_l2t']
    recv_queue = config['queue_t2l']
    # recv_queue and send_queue are multiprocessing.Queue
    # recv_queue is only for receiving
    # send_queue is only for sending

    num_timesteps = config['num_timesteps']
    num_seq = config['num_seq']
    img_scale_x = config['img_scale_x']
    img_scale_y = config['img_scale_y']
    drv.init()
    dev = drv.Device(int(config['gpu'][-1]))
    ctx_2 = dev.make_context()

    sock_2 = zmq.Context().socket(zmq.PAIR)
    sock_2.bind('tcp://*:{0}'.format(sock_data_2))
    shape_temporal, dtype_temporal, h_temporal = sock_2.recv_pyobj()
    print('shared_x information received', shape_temporal)
    gpu_data_remote_temporal = gpuarray.GPUArray(
        shape_temporal,
        dtype_temporal,
        gpudata=drv.IPCMemoryHandle(h_temporal))
    gpu_data_temporal = gpuarray.GPUArray(shape_temporal, dtype_temporal)
    # print 'img_mean received'
    # The first time, do the set ups and other stuff
    # receive information for loading
    while True:
        video_name_temporal = recv_queue.get()
        rand_param = recv_queue.get()
        if config['modal'] == 'rgb':
            data_temporal = prepare_data_rgb(video_name_temporal,
                                             num_timesteps,
                                             num_seq,
                                             rand_param,
                                             data_shape=(img_scale_x,
                                                         img_scale_y, 3))
        else:
            data_temporal = prepare_data_flow(video_name_temporal,
                                              num_timesteps,
                                              num_seq,
                                              rand_param,
                                              data_shape=(img_scale_x,
                                                          img_scale_y))

        gpu_data_temporal.set(data_temporal)
        # wait for computation on last minibatch to finish
        msg = recv_queue.get()
        assert msg == 'calc_finished'
        drv.memcpy_peer(
            gpu_data_remote_temporal.ptr, gpu_data_temporal.ptr,
            gpu_data_temporal.dtype.itemsize * gpu_data_temporal.size, ctx_2,
            ctx_2)

        ctx_2.synchronize()
        send_queue.put('copy_finished')
Example #6
        def thunk():
            x, truth = inputs[0], inputs[1]
            context = None
            if hasattr(x[0], 'context'):
                context = x[0].context
            z = outputs[0]
            z_shape = x[0].shape
            if z[0] is None or z[0].shape != z_shape:
                z[0] = pygpu.zeros(z_shape,
                                   dtype=theano.config.floatX,
                                   context=context)
            x_ptr, _ = get_tens_ptr(x[0])
            truth_ptr, _ = get_tens_ptr(truth[0])
            z_ptr, z_obj = get_tens_ptr(z[0])

            # store as gpuarray
            best_idx_ptr = gpuarray.GPUArray(shape=(np.prod(
                truth[0].shape[:2]), ),
                                             dtype=np.int32)
            best_iou_ptr = gpuarray.GPUArray(shape=(np.prod(
                truth[0].shape[:2]), ),
                                             dtype=np.float32)

            yolo_ptr, _ = get_yolo_info(n_classes, n_anchors, l_obj, l_noobj,
                                        anchors)

            # get best index
            index_fn(best_idx_ptr,
                     best_iou_ptr,
                     x_ptr,
                     truth_ptr,
                     yolo_ptr,
                     block=(1, 1, 1),
                     grid=(x[0].shape[0], 1, 1))

            n_total = np.int32(x[0].shape[0] * n_anchors *
                               np.prod(x[0].shape[-2:]))
            n_matched = np.int32(gpuarray.sum(best_idx_ptr != -1).get())

            grad_fn(z_ptr,
                    best_idx_ptr,
                    best_iou_ptr,
                    x_ptr,
                    truth_ptr,
                    yolo_ptr,
                    n_matched,
                    n_total,
                    block=(n_anchors, 1, 1),
                    grid=(x[0].shape[0], x[0].shape[2], x[0].shape[3]))

            # free all memory
            del best_idx_ptr
            del best_iou_ptr
            yolo_ptr.free()
Example #7
def frexp(arg, stream=None):
    """Return a tuple `(significands, exponents)` such that 
    `arg == significand * 2**exponent`.
    """
    sig = gpuarray.GPUArray(arg.shape, arg.dtype)
    expt = gpuarray.GPUArray(arg.shape, arg.dtype)

    func = elementwise.get_frexp_kernel()
    func.set_block_shape(*arg._block)
    func.prepared_async_call(arg._grid, stream, arg.gpudata, sig.gpudata,
                             expt.gpudata, arg.mem_size)

    return sig, expt
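Assuming a live CUDA context and that the `frexp` above is importable, a hedged usage sketch that checks the `arg == significand * 2**exponent` reconstruction on the host might look like this:

import numpy as np
import pycuda.autoinit  # initializes a CUDA context
import pycuda.gpuarray as gpuarray

x = np.random.rand(1024).astype(np.float32) + 0.5
x_gpu = gpuarray.to_gpu(x)
sig, expt = frexp(x_gpu)
np.testing.assert_allclose(sig.get() * 2.0 ** expt.get(), x, rtol=1e-6)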
Example #8
def modf(arg, stream=None):
    """Return a tuple `(fracpart, intpart)` of arrays containing the
    fractional and integer parts of `arg`.
    """

    intpart = gpuarray.GPUArray(arg.shape, arg.dtype)
    fracpart = gpuarray.GPUArray(arg.shape, arg.dtype)

    func = elementwise.get_modf_kernel()
    func.set_block_shape(*arg._block)
    func.prepared_async_call(arg._grid, stream, arg.gpudata, intpart.gpudata,
                             fracpart.gpudata, arg.mem_size)

    return fracpart, intpart
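Similarly, a hedged usage sketch for the `modf` above, verifying that the fractional and integer parts recombine to the input:

import numpy as np
import pycuda.autoinit  # initializes a CUDA context
import pycuda.gpuarray as gpuarray

x_gpu = gpuarray.to_gpu(np.random.rand(1024).astype(np.float32) * 10.0)
frac, intp = modf(x_gpu)
np.testing.assert_allclose((frac + intp).get(), x_gpu.get(), rtol=1e-6)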
Example #9
    def prepare_server(self):

        self.g_param_list = self.param_list
        self.g_param_ga_list = []
        self.w_param_ga_list = []
        self.w_param_list = []

        for param in self.param_list:
            np_param = param.get_value()
            w_param = theano.shared(np_param)
            self.w_param_list.append(w_param)
            w_param_ga = gpuarray.GPUArray(np_param.shape, np_param.dtype)
            self.w_param_ga_list.append(w_param_ga)
            g_param_ga = gpuarray.GPUArray(np_param.shape, np_param.dtype)
            self.g_param_ga_list.append(g_param_ga)
Example #10
def test_type2(shape=(16, 16, 16), M=4096, tol=1e-3):
    kxyz = utils.gen_nu_pts(M)
    fk = utils.gen_uniform_data(shape)

    kxyz_gpu = gpuarray.to_gpu(kxyz)
    fk_gpu = gpuarray.to_gpu(fk)

    c_gpu = gpuarray.GPUArray(shape=(M,), dtype=np.complex64)

    plan = cufinufft.plan(2, shape, -1, tol)

    cufinufft.set_nu_pts(plan, M, kxyz_gpu[0].gpudata, kxyz_gpu[1].gpudata,
            kxyz_gpu[2].gpudata)

    cufinufft.execute(plan, c_gpu.gpudata, fk_gpu.gpudata)

    cufinufft.destroy(plan)

    c = c_gpu.get()

    ind = M // 2

    c_est = c[ind]
    c_target = utils.direct_type2(fk, kxyz[:, ind])

    type2_rel_err = np.abs(c_target - c_est) / np.abs(c_target)

    print('Type 2 relative error:', type2_rel_err)
Example #11
def test_type1(shape=(16, 16, 16), M=4096, tol=1e-3):
    kxyz = utils.gen_nu_pts(M)
    c = utils.gen_nonuniform_data(M)

    kxyz_gpu = gpuarray.to_gpu(kxyz)
    c_gpu = gpuarray.to_gpu(c)
    fk_gpu = gpuarray.GPUArray(shape, dtype=np.complex64)

    plan = cufinufft.plan(1, shape, 1, tol)

    cufinufft.set_nu_pts(plan, M, kxyz_gpu[0].gpudata, kxyz_gpu[1].gpudata,
            kxyz_gpu[2].gpudata)

    cufinufft.execute(plan, c_gpu.gpudata, fk_gpu.gpudata)

    fk = fk_gpu.get()

    ind = int(0.1789 * np.prod(shape))

    fk_est = fk.ravel()[ind]
    fk_target = utils.direct_type1(c, kxyz, shape, ind)

    type1_rel_err = np.abs(fk_target - fk_est) / np.abs(fk_target)

    print('Type 1 relative error:', type1_rel_err)
Example #12
def gpu_tensor_gemm(handle, a, b):
    c = gpuarray.GPUArray((a.shape[0], b.shape[1]), dtype=a.dtype)
    # c.fill(0)
    # driver.memset_d16(c.gpudata, 15360, c.mem_size)
    # print(c.get())
    cublas_dot.cublas_gemm(handle, a, b, c)
    return c
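A hypothetical usage sketch for the helper above; `cublas_dot` is assumed to be the project's own wrapper, and the handle creation is sketched here with scikit-cuda's `cublasCreate` (an assumption, not necessarily what the original code uses):

import numpy as np
import pycuda.autoinit  # initializes a CUDA context
import pycuda.gpuarray as gpuarray
from skcuda import cublas

a = gpuarray.to_gpu(np.random.rand(128, 64).astype(np.float32))
b = gpuarray.to_gpu(np.random.rand(64, 32).astype(np.float32))
handle = cublas.cublasCreate()
c = gpu_tensor_gemm(handle, a, b)    # expected result shape: (128, 32)
cublas.cublasDestroy(handle)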
Example #13
def _test_type1(dtype, shape=(16, 16, 16), M=4096, tol=1e-3):
    complex_dtype = utils._complex_dtype(dtype)

    dim = len(shape)

    k = utils.gen_nu_pts(M, dim=dim).astype(dtype)
    c = utils.gen_nonuniform_data(M).astype(complex_dtype)

    k_gpu = gpuarray.to_gpu(k)
    c_gpu = gpuarray.to_gpu(c)
    fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype)

    plan = cufinufft(1, shape, eps=tol, dtype=dtype)

    plan.set_pts(k_gpu[0], k_gpu[1], k_gpu[2])

    plan.execute(c_gpu, fk_gpu)

    fk = fk_gpu.get()

    ind = int(0.1789 * np.prod(shape))

    fk_est = fk.ravel()[ind]
    fk_target = utils.direct_type1(c, k, shape, ind)

    type1_rel_err = np.abs(fk_target - fk_est) / np.abs(fk_target)

    print('Type 1 relative error:', type1_rel_err)

    assert type1_rel_err < 0.01
Example #14
def ones(shape: tuple,
         dtype: np.dtype,
         order: str = 'C',
         allocator=drv.mem_alloc):
    """
    Return an array of the given shape and dtype filled with ones.

    Parameters
    ----------
    shape : tuple
        Array shape.
    dtype : data-type
        Data type for the array.
    order : {'C', 'F'}, optional
        Create array using row-major or column-major format.
    allocator : callable, optional
        Returns an object that represents the memory allocated for
        the requested array.

    Returns
    -------
    out : pycuda.gpuarray.GPUArray
        Array of ones with the given shape, dtype, and order.
    """

    out = gpuarray.GPUArray(shape, dtype, allocator, order=order)
    o = np.ones((), dtype)
    out.fill(o)
    return out
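A hypothetical usage sketch for the `ones` helper above (requires an active CUDA context, e.g. via `pycuda.autoinit`):

import numpy as np
import pycuda.autoinit  # initializes a CUDA context

x = ones((3, 2), np.float32)
print(x.get())   # a (3, 2) host array filled with 1.0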
Example #15
def _test_type2(dtype, shape=(16, 16, 16), M=4096, tol=1e-3):
    complex_dtype = utils._complex_dtype(dtype)

    k = utils.gen_nu_pts(M).astype(dtype)
    fk = utils.gen_uniform_data(shape).astype(complex_dtype)

    k_gpu = gpuarray.to_gpu(k)
    fk_gpu = gpuarray.to_gpu(fk)

    c_gpu = gpuarray.GPUArray(shape=(M, ), dtype=complex_dtype)

    plan = cufinufft(2, shape, eps=tol, dtype=dtype)

    plan.set_pts(k_gpu[0], k_gpu[1], k_gpu[2])

    plan.execute(c_gpu, fk_gpu)

    c = c_gpu.get()

    ind = M // 2

    c_est = c[ind]
    c_target = utils.direct_type2(fk, k[:, ind])

    type2_rel_err = np.abs(c_target - c_est) / np.abs(c_target)

    print('Type 2 relative error:', type2_rel_err)

    assert type2_rel_err < 0.01
Example #16
def zeros(shape, dtype, allocator=drv.mem_alloc):
    """
    Return an array of the given shape and dtype filled with zeros.

    Parameters
    ----------
    shape : tuple
        Array shape.
    dtype : data-type
        Data type for the array.
    allocator : callable
        Returns an object that represents the memory allocated for
        the requested array.

    Returns
    -------
    out : pycuda.gpuarray.GPUArray
        Array of zeros with the given shape and dtype.

    Notes
    -----
    This function exists to work around the following numpy bug that
    prevents pycuda.gpuarray.zeros() from working properly with
    complex types in pycuda 2011.1.2:
    http://projects.scipy.org/numpy/ticket/1898

    """

    out = gpuarray.GPUArray(shape, dtype, allocator)
    out.fill(0)
    return out
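And a hypothetical usage sketch exercising the complex-dtype case the docstring's workaround is aimed at:

import numpy as np
import pycuda.autoinit  # initializes a CUDA context

z = zeros((4, 4), np.complex128)
assert (z.get() == 0).all()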
Example #17
def resize_gpu(src_vol, dst_vol=None, dst_shape=None, scaling=None):

    if dst_shape is None:
        assert scaling
        dst_shape = [int(np.round(i * scaling)) for i in src_vol.shape]

    if dst_vol is None:
        dst_vol = gpuarray.GPUArray(dst_shape, np.float32)

    ndarray_to_float_tex(_tex_ref, src_vol)

    block = (32, 32, 1)
    grid = (int(divup(dst_vol.shape[2], block[0])),
            int(divup(dst_vol.shape[1], block[1])), 1)

    _kernel(dst_vol,
            np.int32(src_vol.shape[2]),
            np.int32(src_vol.shape[1]),
            np.int32(src_vol.shape[0]),
            np.int32(dst_vol.shape[2]),
            np.int32(dst_vol.shape[1]),
            np.int32(dst_vol.shape[0]),
            grid=grid, block=block)

    return dst_vol
Example #18
    def _FarnebackUpdateFlow_GaussianBlur_gpu(self, poly_coefficients0,
                                              poly_coefficients1, flow_gpu, M,
                                              winsize, update_matrices):
        sigma = self.winsize * 0.3

        M_filtered_gpu = gpuarray.GPUArray(M.shape, M.dtype)

        for i in range(M.shape[0]):
            farneback3d._filtering.smooth_cuda_gauss(M[i],
                                                     sigma,
                                                     winsize,
                                                     rtn_gpu=M_filtered_gpu[i])

        block = (32, 32, 1)
        grid = (int(divup(flow_gpu.shape[3],
                          block[0])), int(divup(flow_gpu.shape[2],
                                                block[1])), 1)

        self._solve_equations_kernel(M_filtered_gpu,
                                     flow_gpu,
                                     np.int32(flow_gpu.shape[3]),
                                     np.int32(flow_gpu.shape[2]),
                                     np.int32(flow_gpu.shape[1]),
                                     block=block,
                                     grid=grid)

        if update_matrices:
            self._FarnebackUpdateMatrices_gpu(poly_coefficients0,
                                              poly_coefficients1, flow_gpu, M)
Example #19
def test_opts(shape=(8, 8, 8), M=32, tol=1e-3):
    dtype = np.float32

    complex_dtype = utils._complex_dtype(dtype)

    dim = len(shape)

    k = utils.gen_nu_pts(M, dim=dim).astype(dtype)
    c = utils.gen_nonuniform_data(M).astype(complex_dtype)

    k_gpu = gpuarray.to_gpu(k)
    c_gpu = gpuarray.to_gpu(c)
    fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype)

    plan = cufinufft(1,
                     shape,
                     eps=tol,
                     dtype=dtype,
                     gpu_sort=False,
                     gpu_maxsubprobsize=10)

    plan.set_pts(k_gpu[0], k_gpu[1], k_gpu[2])

    plan.execute(c_gpu, fk_gpu)

    fk = fk_gpu.get()

    ind = int(0.1789 * np.prod(shape))

    fk_est = fk.ravel()[ind]
    fk_target = utils.direct_type1(c, k, shape, ind)

    type1_rel_err = np.abs(fk_target - fk_est) / np.abs(fk_target)

    assert type1_rel_err < 0.01
Example #20
    def run(self):
        self.dev = cuda.Device(self.devID)
        self.ctx = self.dev.make_context()
        global code
        global cdata
        numThreads = 1024
        numBlocks = 30
        N = numThreads * numBlocks
        mod = SourceModule(code % {"NGENERATORS": N},
                           no_extern_c=True,
                           arch='sm_61')
        init_func = mod.get_function("initkernel")
        fill_func = mod.get_function("randfillkernel")
        init_func(np.int32(time.time()),
                  block=(numThreads, 1, 1),
                  grid=(numBlocks, 1, 1))

        split = ceil(self.hptr.size / float(2000000000))
        split_data = np.array_split(self.hptr, split)
        gdata = gpuarray.GPUArray(shape=split_data[0].size, dtype=np.float32)
        for data in split_data:
            fill_func(gdata,
                      np.uint64(gdata.size),
                      block=(numThreads, 1, 1),
                      grid=(numBlocks, 1, 1))
            gdata.get(data)

        self.ctx.pop()
        del self.ctx
Example #21
def prepare_kernels(files, templates, constvars, blockvars={}):
    """ Compile and prepare CUDA kernel functions

    Args:
        files : list of cuda source file handles
        templates : list of tuples describing the kernels
        constvars : dict of readonly variables
        blockvars : dict of blockvars whose description will be included
                    in the preamble as preprocessor macros
    Returns:
        kernels : dict of executable CUDA kernels
    """
    preamble, constvars = prepare_vars(constvars, blockvars)
    kernels_code = preamble
    for f in files:
        kernels_code += f.read().decode("utf-8")

    mod = compiler.SourceModule(kernels_code)
    kernels = {}
    for d in templates:
        kernels[d[0]] = prepare_kernelfun(mod, *d)

    for name, val in constvars.items():
        const_ptr, size_in_bytes = mod.get_global(name)
        pycuda.driver.memcpy_htod(const_ptr, val)
        # WARNING: The gpudata argument in gpuarray.GPUArray usually requires a
        # pycuda.driver.DeviceAllocation and const_ptr is an int generated from
        # casting a CUdeviceptr to an int.
        # However, since DeviceAllocation is a simple wrapper around CUdeviceptr
        # (that gives a CUdeviceptr when cast to an int), it works like this.
        constvars[name] = gpuarray.GPUArray(val.shape,
                                            val.dtype,
                                            gpudata=const_ptr)

    return kernels
Example #22
def copy_torch2glumpy(dst: gloo.VertexBuffer, src: torch.cuda.FloatTensor):
    # torch 2 pycuda
    src = gp.GPUArray(src.shape, np.float32, gpudata=src.data_ptr())

    # copy pycuda 2 glumpy
    copy_pycuda2glumpy(dst, src)
    return
Example #23
def copy_pycuda2RegisteredBuffer(dst: pycuda.gl.RegisteredBuffer,
                                 src: gp.GPUArray):
    # pycuda 2 RegisteredBuffer
    with cuda_activate(dst) as ptr:
        dst = gp.GPUArray(src.shape, np.float32, gpudata=ptr)
        dst[:] = src
    return
Example #24
def ones_cuda(shape):
    """Create GPUArray of ones directly on GPU memory.

    Parameters
    ----------
    shape : tuple
        Dimensions of the GPUArray.

    Returns
    -------
    gpuarray
        GPUArray of ones.

    Examples
    --------
    >>> a = ones_cuda((3, 2))
    [[ 1.,  1.],
     [ 1.,  1.],
     [ 1.,  1.]]

    >>> type(a)
    <class 'pycuda.gpuarray.GPUArray'>
    """
    a = cuda_array.GPUArray(shape, dtype=float32, allocator=pycuda.driver.mem_alloc, order='C')
    a.fill(1.0)
    return a
Example #25
def frexp(arg, stream=None):
    """Return a tuple `(significands, exponents)` such that
    `arg == significand * 2**exponent`.
    """
    if not arg.flags.forc:
        raise RuntimeError("only contiguous arrays may "
                           "be used as arguments to this operation")

    sig = gpuarray.GPUArray(arg.shape, arg.dtype)
    expt = gpuarray.GPUArray(arg.shape, arg.dtype)

    func = elementwise.get_frexp_kernel()
    func.prepared_async_call(arg._grid, arg._block, stream, arg.gpudata,
                             sig.gpudata, expt.gpudata, arg.mem_size)

    return sig, expt
Example #26
def squared_loss(y_true, y_pred):
    """Compute the squared loss for regression.

    Parameters
    ----------
    y_true : array-like or label indicator matrix
        Ground truth (correct) values.

    y_pred : array-like or label indicator matrix
        Predicted values, as returned by a regression estimator.

    Returns
    -------
    loss : float
        The degree to which the samples are correctly predicted.
    """
    tmp_gpu = gpuarray.GPUArray(y_true.shape, y_true.dtype)
    if y_true.dtype == np.float64:
        cuSquaredError(y_true.gpudata,
                       y_pred.gpudata,
                       tmp_gpu.gpudata,
                       np.int32(y_true.size),
                       block=(blockSize, 1, 1),
                       grid=(int((y_true.size - 1) / blockSize + 1), 1, 1))
    else:
        cuSquaredErrorf(y_true.gpudata,
                        y_pred.gpudata,
                        tmp_gpu.gpudata,
                        np.int32(y_true.size),
                        block=(blockSize, 1, 1),
                        grid=(int((y_true.size - 1) / blockSize + 1), 1, 1))
    mean = float(cumisc.mean(tmp_gpu).get())
    return (mean / 2)
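A hypothetical usage sketch, assuming the module-level kernels (`cuSquaredError`, `cuSquaredErrorf`, `blockSize`) are compiled and that scikit-cuda's misc module (imported as `cumisc`) has been initialized via `cumisc.init()`:

import numpy as np
import pycuda.autoinit  # initializes a CUDA context
import pycuda.gpuarray as gpuarray

y_true = gpuarray.to_gpu(np.random.rand(1000).astype(np.float32))
y_pred = gpuarray.to_gpu(np.random.rand(1000).astype(np.float32))
loss = squared_loss(y_true, y_pred)   # mean squared error divided by 2
print(loss)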
Example #27
def modf(arg, stream=None):
    """Return a tuple `(fracpart, intpart)` of arrays containing the
    fractional and integer parts of `arg`.
    """
    if not arg.flags.forc:
        raise RuntimeError("only contiguous arrays may "
                           "be used as arguments to this operation")

    intpart = gpuarray.GPUArray(arg.shape, arg.dtype)
    fracpart = gpuarray.GPUArray(arg.shape, arg.dtype)

    func = elementwise.get_modf_kernel()
    func.prepared_async_call(arg._grid, arg._block, stream, arg.gpudata,
                             intpart.gpudata, fracpart.gpudata, arg.mem_size)

    return fracpart, intpart
Example #28
    def alloc_buf(self, size=None, like=None, wrap_in_array=False):
        if like is not None:
            # When calculating the total array size, take into account
            # any striding.
            # XXX: why does it even work?
            buf_size = like.shape[0] * like.strides[0]
            buf = cuda.mem_alloc(buf_size)
            self._total_memory_bytes += buf_size

            if like.base is not None and type(
                    like.base) is not cuda.PagelockedHostAllocation:
                self.buffers[buf] = like.base
            else:
                self.buffers[buf] = like

            self.to_buf(buf)
            if wrap_in_array:
                self.arrays[buf] = cudaarray.GPUArray(like.shape,
                                                      like.dtype,
                                                      gpudata=buf)
        else:
            self._total_memory_bytes += size
            buf = cuda.mem_alloc(size)

        return buf
Example #29
def get_gpuarray(bh_ary):
    """Return a PyCUDA GPUArray object that points to the same device memory as `bh_ary`.

    Parameters
    ----------
    bh_ary : ndarray (Bohrium array)
        Must be a Bohrium base array

    Returns
    -------
    out : GPUArray

    Notes
    -----
    Changing or deallocating `bh_ary` invalidates the returned GPUArray array!

    """

    if get_base(bh_ary) is not bh_ary:
        raise RuntimeError('`bh_ary` must be a base array and not a view')
    assert (bh_ary.bhc_mmap_allocated)

    with contexts.DisableBohrium():
        _import_pycuda_module()
        from pycuda import gpuarray
        dev_ptr = get_data_pointer(get_base(bh_ary),
                                   copy2host=False,
                                   allocate=True)
        return gpuarray.GPUArray(bh_ary.shape, bh_ary.dtype, gpudata=dev_ptr)
Example #30
    def array(self, shape, dtype, strides=None, allocator=None):
        # In PyCUDA, the default allocator is not None, but a default alloc object
        kwds = {}
        if strides is not None:
            kwds['strides'] = strides
        if allocator is not None:
            kwds['allocator'] = allocator
        return gpuarray.GPUArray(shape, dtype, **kwds)
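For reference, a minimal sketch of the direct PyCUDA call this method wraps when neither `strides` nor `allocator` is given:

import numpy as np
import pycuda.autoinit  # initializes a CUDA context
import pycuda.gpuarray as gpuarray

a = gpuarray.GPUArray((16, 16), np.float32)   # uninitialized, C-contiguous device buffer
a.fill(0)                                     # contents are undefined until explicitly written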