Exemple #1
0
    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_sm = node.outputs[0].dtype
        load_x = load_w(node.inputs[0].dtype)
        load_b = load_w(node.inputs[1].dtype)
        write_sm = write_w(node.outputs[0].dtype)
        work_sm = work_dtype(node.outputs[0].dtype)
        flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_b = gpuarray.dtype_to_ctype(dtype_b)
        type_sm = gpuarray.dtype_to_ctype(dtype_sm)
        type_acc = gpuarray.dtype_to_ctype(work_sm)

        ctype = gpuarray.dtype_to_ctype(work_sm)

        params = [
            gpuarray.SIZE,
            gpuarray.SIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
        ]
        kernels = []
        kname = "kSoftmaxWithBias"
        k_var = "kSoftmaxWithBias_" + nodename
        code = ("""#include "cluda.h"

        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
                       GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
        {
            GA_DECL_SHARED_BODY(%(type_acc)s, buf);
            LOCAL_MEM_ARG %(type_acc)s * buf2 = buf + N;
            x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
            b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
            sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
            for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1]);
                    buf[tx] += %(load_b)s(b[tx * sb0]);
                    buf2[tx] = buf[tx];
                }
                local_barrier();
                {
                    // This function trashes buf[1..GA_WARP_SIZE],
                    // leaving the reduction result in buf[0].
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
                        {
                            buf[LID_0] = max(buf[LID_0], buf[i]);
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
                        local_barrier();
                    }
                }
                %(ctype)s row_max = buf[0];
                local_barrier();
                for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){;
                    buf[__i] = exp(buf2[__i] - row_max);
                    buf2[__i] = buf[__i];
                }
                local_barrier();
                {
                    // This function trashes buf[1..GA_WARP_SIZE],
                    // leaving the reduction result in buf[0].
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
                        {
                            buf[LID_0] = buf[LID_0] + buf[i];
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
                        local_barrier();
                    }
                }
                %(ctype)s row_sum = buf[0];
                local_barrier();
                for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){
                    buf[__i] = buf2[__i] / row_sum;
                }
                local_barrier();
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx]);
                }
                local_barrier();
            }
        }
        """ % locals())
        kernels.append(
            Kernel(code=code,
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var))
        kname = "kSoftmaxWithBias_fixed_shared"
        k_var = "kSoftmaxWithBias_fixed_shared" + nodename
        code = ("""#include "cluda.h"

        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
                       GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
        {
            GA_DECL_SHARED_BODY(%(type_acc)s, buf);
            x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
            b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
            sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
            for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
                GLOBAL_MEM const %(type_x)s *x_ptr = &x[blockIDX * sx0];
                GLOBAL_MEM %(type_sm)s *sm_ptr = &sm[blockIDX * sm_s0];
                {
                    // This function trashes buf[1..n_threads],
                    // leaving the reduction result in buf[0].
                    %(ctype)s red = %(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]);
                    #pragma unroll 16
                    for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
                        red = max(red, %(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0]));
                    }
                    buf[LID_0] = red;
                    local_barrier();
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
                            buf[LID_0] = max(buf[LID_0], buf[i]);
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
                        local_barrier();
                    }
                }
                %(ctype)s row_max = buf[0];
                local_barrier();
                {
                    // This function trashes buf[1..n_threads],
                    // leaving the reduction result in buf[0].
                    %(ctype)s red = exp(%(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]) - row_max);
                    #pragma unroll 16
                    for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
                    red = red + exp(%(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0]) - row_max);
                    }
                    buf[LID_0] = red;
                    local_barrier();
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
                            buf[LID_0] = buf[LID_0] + buf[i];
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
                        local_barrier();
                    }
                }
                %(ctype)s row_sum = buf[0];
                local_barrier();
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    sm_ptr[tx * sm_s1] = %(write_sm)s(exp(%(load_x)s(x_ptr[tx * sx1]) + %(load_b)s(b[tx * sb0]) - row_max) / row_sum);
                }
                local_barrier();
            }
        }
        """ % locals())
        kernels.append(
            Kernel(code=code,
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var))
        return kernels
Exemple #2
0
    def c_code(self, node, nodename, inp, out, sub):
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_z = node.outputs[0].dtype
        work_x = work_dtype(dtype_x)
        itemsize_x = np.dtype(dtype_x).itemsize
        itemsize_b = np.dtype(dtype_b).itemsize
        itemsize_z = np.dtype(dtype_z).itemsize
        typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        x, b = inp
        (z, ) = out
        fail = sub["fail"]
        ctx = sub["params"]
        err_check = ("""
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
                %(fail)s;
            }
        """ % locals())
        return ("""
        if (PyGpuArray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "rank error input");
            %(fail)s;
        }
        if (PyGpuArray_NDIM(%(b)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "rank error for the bias");
            %(fail)s;
        }
        if ((PyGpuArray_DIMS(%(x)s)[1] !=
            PyGpuArray_DIMS(%(b)s)[0]))
        {
            PyErr_Format(PyExc_ValueError,
                         "number of columns in x (%%ld)"
                         " does not match length of b (%%ld)",
                         (long int)PyGpuArray_DIMS(%(x)s)[1],
                         (long int)PyGpuArray_DIMS(%(b)s)[0]);
            %(fail)s;
        }
        if ((NULL == %(z)s)
            || (PyGpuArray_DIMS(%(z)s)[0] !=
                PyGpuArray_DIMS(%(x)s)[0])
            || (PyGpuArray_DIMS(%(z)s)[1] !=
                PyGpuArray_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(z)s);
            %(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
                                %(typecode)s, GA_C_ORDER,
                                %(ctx)s, Py_None);
            if (!%(z)s) {
                %(fail)s
            }
        }
        {
            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024)), 1, 1};
//TODO, detect the maximum number of thread per block.
            size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)256), 1, 1}; // TODO: Read GA_CTX_PROP_MAXLSIZE0
            size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
                                     2 * sizeof(npy_%(work_x)s);
            ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
            ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
            ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
            ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
            ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
            const char *fmt_str, *msg;
            void *kernel_params[] = {
                (void *)&PyGpuArray_DIMS(%(x)s)[0],
                (void *)&PyGpuArray_DIMS(%(x)s)[1],
                (void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
                (void *)&stride_X0, (void *)&stride_X1,
                (void *)%(b)s->ga.data, (void *)&%(b)s->ga.offset,
                (void *)&stride_B0,
                (void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
                (void *)&stride_Z0, (void *)&stride_Z1};
            int err = GA_NO_ERROR;
            if (PyGpuArray_DIMS(%(x)s)[0] > 0)
            {
              if(shmem_sz < (32 * 1024 - 500)){
                err = GpuKernel_call(&kSoftmaxWithBias_%(nodename)s, 3,
                                     n_blocks, threads_per_block, shmem_sz,
                                     kernel_params);
                fmt_str = "gpuarray error: kSoftmaxWithBias_%(nodename)s: %%s";
                msg = GpuKernel_error(&kSoftmaxWithBias_%(nodename)s, err);
              }else{
                err = GpuKernel_call(&kSoftmaxWithBias_fixed_shared%(nodename)s,
                                     3, n_blocks, threads_per_block,
                                     threads_per_block[0] * sizeof(npy_%(work_x)s),
                                     kernel_params);
                fmt_str = "gpuarray error: kSoftmaxWithBias_fixed_shared%(nodename)s: %%s";
                msg = GpuKernel_error(&kSoftmaxWithBias_fixed_shared%(nodename)s, err);
              }
              %(err_check)s
            }
        }
        assert(%(z)s);
        """ % locals())
Exemple #3
0
    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        work_x = work_dtype(dtype_x)
        work_b = work_dtype(dtype_b)
        load_x = load_w(dtype_x)
        load_b = load_w(dtype_b)
        write_x = write_w(dtype_x)
        write_b = write_w(dtype_b)
        flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_b = gpuarray.dtype_to_ctype(dtype_b)
        work_x = gpuarray.dtype_to_ctype(work_x)
        type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
        kname = "k_xent_sm_1hot_bias"
        k_var = "k_xent_sm_1hot_bias_" + nodename
        if node.inputs[0].type.context.kind != b"cuda":
            f = ""
        else:
            f = "" if dtype_x == "float64" else "f"
        params = [
            gpuarray.SIZE,
            gpuarray.SIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
        ]
        sio = StringIO()
        print(
            """#include "cluda.h"

        KERNEL void %(kname)s(const ga_size M, const ga_size N,
            GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1,
            GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0,
            GLOBAL_MEM const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx, const ga_ssize y_idxs0,
            GLOBAL_MEM %(type_x)s* nll_data, const ga_size offset_nll, const ga_ssize nlls0,
            GLOBAL_MEM %(type_x)s* sm_data, const ga_size offset_sm, const ga_ssize sms0, const ga_ssize sms1,
            GLOBAL_MEM %(type_y_idx)s* am_data, const ga_size offset_am, const ga_ssize ams0 GA_DECL_SHARED_PARAM(%(work_x)s, per_thread_values))
        {
          x_data = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x_data)+offset_x);
          b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
          y_idx_data = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx_data)+offset_y_idx);
          nll_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)nll_data)+offset_nll);
          sm_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)sm_data)+offset_sm);
          am_data = (GLOBAL_MEM %(type_y_idx)s *)(((GLOBAL_MEM char *)am_data)+offset_am);
          for (ga_int row = GID_0; row < M; row += GDIM_0){
            GLOBAL_MEM const %(type_x)s* x = x_data + xs0 * row;
            GLOBAL_MEM %(type_x)s* sm = sm_data + sms0 * row;
            GA_DECL_SHARED_BODY(%(work_x)s, per_thread_values);
            LOCAL_MEM %(work_x)s row_max, sum, sum_inv;
            LOCAL_MEM ga_int row_max_threadIdx;
            %(work_x)s per_thread_row_max, per_thread_sum;
            ga_int per_thread_row_max_j;
            // COMPUTE ROW MAX AND ARGMAX
            // compute separate per-thread maximums and argmaxes
            per_thread_row_max = NAN;
            per_thread_row_max_j = 0;
            for (ga_int j = LID_0; j < N; j += LDIM_0)
            {
              %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
              per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j;
              per_thread_row_max = fmax%(f)s(row_ij, per_thread_row_max);
            }
            per_thread_values[LID_0] = per_thread_row_max;
            local_barrier();
            if (LID_0 == 0) {
              row_max = NAN;
              row_max_threadIdx = 0;
              for (ga_int j = 0; j < LDIM_0; j++)
              {
                %(work_x)s per_thread_max = per_thread_values[j];
                row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx;
                row_max = fmax%(f)s(per_thread_max, row_max);
              }
            }
            local_barrier();
            // The thread with the highest max writes out which of its
            // values was the winner.
            if (LID_0 == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j;
            // COMPUTE SOFTMAX
            per_thread_sum = 0.0;
            for (ga_int j = LID_0; j < N; j += LDIM_0)
            {
              %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
              %(work_x)s sm_ij = exp%(f)s(row_ij - row_max);
              per_thread_sum += sm_ij;
              sm[j * sms1] = %(write_x)s(sm_ij);
            }
            per_thread_values[LID_0] = per_thread_sum;
            local_barrier();
            if (LID_0 == 0) {
              sum = 0.0;
              for (ga_int j = 0; j < LDIM_0; j++) {
                sum += per_thread_values[j];
              }
              sum_inv = 1.0 / sum;
            }
            local_barrier();
            for (ga_int j = LID_0; j < N; j += LDIM_0) {
              sm[j * sms1] = %(write_x)s(%(load_x)s(sm[j * sms1]) * sum_inv);
            }
            if (LID_0 == 0) {
              const %(type_y_idx)s y_idx = (ga_int)y_idx_data[row * y_idxs0];
              if ((y_idx >= N || y_idx < 0)) {
                // raise some suspicion.
                nll_data[row * nlls0] = %(write_x)s(0.0);
              } else {
                nll_data[row * nlls0] = %(write_x)s(
                   - %(load_x)s(x[y_idx * xs1])
                   - %(load_b)s(b[y_idx * bs0])
                   + row_max + log%(f)s(sum));
              }
            }
          }
        }
        """ % locals(),
            file=sio,
        )

        return [
            Kernel(
                code=sio.getvalue(),
                name=kname,
                params=params,
                flags=flags,
                objvar=k_var,
            )
        ]
Exemple #4
0
    def gpu_kernels(self, node, nodename):
        dtype_dnll = node.inputs[0].dtype
        dtype_sm = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        dtype_dx = node.outputs[0].dtype
        work_dnll = work_dtype(dtype_dnll)
        load_dnll = load_w(dtype_dnll)
        load_sm = load_w(dtype_sm)
        write_dx = write_w(dtype_dx)
        flags = Kernel.get_flags(dtype_dnll, dtype_sm, dtype_y_idx, dtype_dx)
        wtype_dnll = gpuarray.dtype_to_ctype(work_dnll)
        type_dnll = gpuarray.dtype_to_ctype(dtype_dnll)
        type_sm = gpuarray.dtype_to_ctype(dtype_sm)
        type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
        type_dx = gpuarray.dtype_to_ctype(dtype_dx)
        kname = "kCrossEntropySoftmax1HotWithBiasDx"
        k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
        params = [
            gpuarray.SIZE,
            gpuarray.SIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
        ]
        sio = StringIO()
        print(
            """#include "cluda.h"

        KERNEL void %(kname)s(
           const ga_size N, const ga_size K,
           GLOBAL_MEM const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0,
           GLOBAL_MEM const %(type_sm)s* sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1,
           GLOBAL_MEM const %(type_y_idx)s* y_idx, const ga_size offset_y_idx, const ga_ssize y_idx_s0,
           GLOBAL_MEM %(type_dx)s* dx, const ga_size offset_dx, const ga_ssize dx_s0, const ga_ssize dx_s1)
        {
            dnll = (GLOBAL_MEM const %(type_dnll)s *)(((GLOBAL_MEM char *)dnll)+offset_dnll);
            sm = (GLOBAL_MEM const %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
            y_idx = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx)+offset_y_idx);
            dx = (GLOBAL_MEM %(type_dx)s *)(((GLOBAL_MEM char *)dx)+offset_dx);
            for (ga_int i = GID_0; i < N; i += GDIM_0)
            {
                %(wtype_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
                %(type_y_idx)s y_i = y_idx[i * y_idx_s0];
                for (ga_int j = LID_0; j < K; j += LDIM_0)
                {
                    if (y_i == j)
                    {
                        dx[i * dx_s0 + j * dx_s1] =
                            %(write_dx)s(dnll_i *
                              (%(load_sm)s(sm[i * sm_s0 + j * sm_s1]) - 1.0));
                    }
                    else
                    {
                        dx[i * dx_s0 + j * dx_s1] =
                            %(write_dx)s(dnll_i *
                              %(load_sm)s(sm[i * sm_s0 + j * sm_s1]));
                    }
                }
            }
        }
        """ % locals(),
            file=sio,
        )
        return [
            Kernel(
                code=sio.getvalue(),
                name=kname,
                params=params,
                flags=flags,
                objvar=k_var,
            )
        ]
Exemple #5
0
 def c_code(self, node, nodename, inp, out, sub):
     itemsize_x = np.dtype(node.inputs[0].dtype).itemsize
     worksize_x = np.dtype(work_dtype(node.inputs[0].dtype)).itemsize
     itemsize_b = np.dtype(node.inputs[1].dtype).itemsize
     itemsize_y_idx = np.dtype(node.inputs[2].dtype).itemsize
     itemsize_nll = np.dtype(node.outputs[0].dtype).itemsize
     itemsize_sm = np.dtype(node.outputs[1].dtype).itemsize
     itemsize_am = np.dtype(node.outputs[2].dtype).itemsize
     x, b, y_idx = inp
     nll, sm, am = out
     fail = sub["fail"]
     ctx = sub["params"]
     k_var = f"k_xent_sm_1hot_bias_{nodename}"
     err_check = ("""
         if (err != GA_NO_ERROR) {
             PyErr_Format(PyExc_RuntimeError,
                          "gpuarray error: %(k_var)s: %%s.",
                          GpuKernel_error(&%(k_var)s, err));
             %(fail)s;
         }
     """ % locals())
     sio = StringIO()
     print(
         """
     if (PyGpuArray_DIMS(%(x)s)[0] !=
         PyGpuArray_DIMS(%(y_idx)s)[0])
     {
         PyErr_SetString(PyExc_ValueError,
                         "dimension mismatch in x,y_idx arguments");
         %(fail)s;
     }
     if (PyGpuArray_DIMS(%(x)s)[1] != PyGpuArray_DIMS(%(b)s)[0])
     {
         PyErr_SetString(PyExc_ValueError,
                         "dimension mismatch in x,b arguments");
         %(fail)s;
     }
     if (aesara_prep_output(&%(nll)s, 1, PyGpuArray_DIMS(%(y_idx)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
     if (aesara_prep_output(&%(sm)s, 2, PyGpuArray_DIMS(%(x)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
     if (aesara_prep_output(&%(am)s, 1, PyGpuArray_DIMS(%(y_idx)s), %(y_idx)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
     {
         size_t n_blocks = std::min(PyGpuArray_DIM(%(x)s, 0), (size_t)4096);
         size_t n_threads = std::min(PyGpuArray_DIM(%(x)s, 1), (size_t)256);
         size_t n_shared = n_threads * %(worksize_x)s;
  //TODO: launch more threads per row and do parallel sum and max reductions
         int err = k_xent_sm_1hot_bias_call(
             1, &n_blocks, &n_threads, n_shared,
             PyGpuArray_DIMS(%(x)s)[0],
             PyGpuArray_DIMS(%(x)s)[1],
             %(x)s->ga.data, %(x)s->ga.offset,
             PyGpuArray_STRIDE(%(x)s, 0) / %(itemsize_x)s,
             PyGpuArray_STRIDE(%(x)s, 1) / %(itemsize_x)s,
             %(b)s->ga.data, %(b)s->ga.offset,
             PyGpuArray_STRIDE(%(b)s, 0) / %(itemsize_b)s,
             %(y_idx)s->ga.data, %(y_idx)s->ga.offset,
             PyGpuArray_STRIDE(%(y_idx)s, 0) / %(itemsize_y_idx)s,
             %(nll)s->ga.data, %(nll)s->ga.offset,
             PyGpuArray_STRIDE(%(nll)s, 0) / %(itemsize_nll)s,
             %(sm)s->ga.data, %(sm)s->ga.offset,
             PyGpuArray_STRIDE(%(sm)s, 0) / %(itemsize_sm)s,
             PyGpuArray_STRIDE(%(sm)s, 1) / %(itemsize_sm)s,
             %(am)s->ga.data, %(am)s->ga.offset,
             PyGpuArray_STRIDE(%(am)s, 0) / %(itemsize_am)s);
         %(err_check)s
     }
     """ % locals(),
         file=sio,
     )
     return sio.getvalue()
Exemple #6
0
    def gpu_kernels(self, node, name):
        out_ctype = pygpu.gpuarray.dtype_to_ctype(node.outputs[0].dtype)
        pvals_ctype = pygpu.gpuarray.dtype_to_ctype(node.inputs[0].dtype)
        unis_ctype = pygpu.gpuarray.dtype_to_ctype(node.inputs[1].dtype)
        work_ctype = pygpu.gpuarray.dtype_to_ctype(work_dtype(node.inputs[0].dtype))
        write_out_ctype = write_w(node.outputs[0].dtype)
        load_in_ctype = load_w(node.inputs[0].dtype)
        code = """#include "cluda.h"

KERNEL void k_multi_warp_multinomial(
    const ga_size nb_multi,
    const ga_size nb_outcomes,
    GLOBAL_MEM %(pvals_ctype)s *global_pvals,
    const ga_size global_pvals_offset,
    const ga_ssize pvals_row_stride,
    const ga_ssize pvals_col_stride,
    GLOBAL_MEM %(unis_ctype)s *global_unis,
    const ga_size global_unis_offset,
    const ga_ssize unis_stride,
    GLOBAL_MEM %(out_ctype)s *global_outs,
    const ga_size global_outs_offset,
    const ga_ssize outs_row_stride,
    const ga_ssize outs_col_stride
)
{
    global_pvals = (GLOBAL_MEM %(pvals_ctype)s *)(((GLOBAL_MEM char *)global_pvals) + global_pvals_offset);
    global_unis = (GLOBAL_MEM %(unis_ctype)s *)(((GLOBAL_MEM char *)global_unis) + global_unis_offset);
    global_outs = (GLOBAL_MEM %(out_ctype)s *)(((GLOBAL_MEM char *)global_outs) + global_outs_offset);
    // each thread takes care of one multinomial draw
    int n = LDIM_0*GID_0 + LID_0;
    if (n < nb_multi)
    {
        %(work_ctype)s cummul = 0.;
        bool done = false;
        const %(work_ctype)s unis_n = %(load_in_ctype)s(global_unis[n*unis_stride]);
        for (ga_size m = 0; m < nb_outcomes; ++m)
        {
            %(work_ctype)s current_out = 0;
            if (!done)
            {
                cummul += %(load_in_ctype)s(global_pvals[m * pvals_col_stride + n * pvals_row_stride]);
                if (unis_n < cummul)
                {
                    current_out = 1;
                    done = true;
                }
            }
            //write out transposed for speed.
            global_outs[n * outs_col_stride +
                        m * outs_row_stride] = %(write_out_ctype)s(current_out);
        }
    }
}
""" % dict(
            out_ctype=out_ctype,
            write_out_ctype=write_out_ctype,
            work_ctype=work_ctype,
            pvals_ctype=pvals_ctype,
            unis_ctype=unis_ctype,
            load_in_ctype=load_in_ctype,
        )
        return [
            Kernel(
                code=code,
                name="k_multi_warp_multinomial",
                params=[
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.GpuArray,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.GpuArray,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.GpuArray,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.SSIZE,
                ],
                flags=Kernel.get_flags(node.outputs[0].dtype),
                objvar="k_multi_warp_multinomial_" + name,
            )
        ]