Example #1
0
 def c_support_code_apply(self, node, nodename):
     ret1 = nvcc_kernel(
         "kSoftmaxWithBias_%s" % nodename,
         params=['int M', 'int N',
                 'const float * x', 'const int sx0', 'const int sx1',
                 'const float * b', 'const int sb0',
                 'float * sm', 'const int sm_s0', 'const int sm_s1'],
         body=[
                 "extern __shared__ float buf[]",
                 "float * buf2 = buf + N",
                 "for (int blockIDX = blockIdx.x; blockIDX < M;"
                 "     blockIDX += gridDim.x){",
                   "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                      "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
                      "buf[tx] += b[tx * sb0]",
                      "buf2[tx] = buf[tx]",
                   "}",
                    "__syncthreads()",
                    inline_softmax('N', 'buf', 'buf2',
                                   'threadIdx.x', 'blockDim.x'),
                   "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                      "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
                   "}",
                   "__syncthreads()",
                 "}",
         ])
     ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
                        params=['int M', 'int N',
                                'const float * x',
                                'const int sx0', 'const int sx1',
                                'const float * b', 'const int sb0',
                                'float * sm',
                                'const int sm_s0', 'const int sm_s1'],
                        body=[
                            "extern __shared__ float buf[]",
                            "for (int blockIDX = blockIdx.x; blockIDX < M;"
                            "     blockIDX += gridDim.x){",
                            "const float *x_ptr = &x[blockIDX * sx0]",
                            "float *sm_ptr = &sm[blockIDX * sm_s0]",
                            inline_softmax_fixed_shared('N', 'buf',
                                                        'x_ptr', 'sx1',
                                                        'sm_ptr', 'sm_s1',
                                                        'threadIdx.x',
                                                        'blockDim.x',
                                                        'b', 'sb0'),
                            "__syncthreads()",
                            "}",
                        ])
     return ret1 + "\n" + ret2
Example #2
0
File: nnet.py Project: yubow/Theano
 def c_support_code_apply(self, node, nodename):
     ret1 = nvcc_kernel(
         "kSoftmaxWithBias_%s" % nodename,
         params=['int M', 'int N',
                 'const float * x', 'const int sx0', 'const int sx1',
                 'const float * b', 'const int sb0',
                 'float * sm', 'const int sm_s0', 'const int sm_s1'],
         body=[
                 "extern __shared__ float buf[]",
                 "float * buf2 = buf + N",
                 "for (int blockIDX = blockIdx.x; blockIDX < M;"
                 "     blockIDX += gridDim.x){",
                   "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                      "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
                      "buf[tx] += b[tx * sb0]",
                      "buf2[tx] = buf[tx]",
                   "}",
                    "__syncthreads()",
                    inline_softmax('N', 'buf', 'buf2',
                                   'threadIdx.x', 'blockDim.x'),
                   "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                      "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
                   "}",
                   "__syncthreads()",
                 "}",
         ])
     ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
                        params=['int M', 'int N',
                                'const float * x',
                                'const int sx0', 'const int sx1',
                                'const float * b', 'const int sb0',
                                'float * sm',
                                'const int sm_s0', 'const int sm_s1'],
                        body=[
                            "extern __shared__ float buf[]",
                            "for (int blockIDX = blockIdx.x; blockIDX < M;"
                            "     blockIDX += gridDim.x){",
                            "const float *x_ptr = &x[blockIDX * sx0]",
                            "float *sm_ptr = &sm[blockIDX * sm_s0]",
                            inline_softmax_fixed_shared('N', 'buf',
                                                        'x_ptr', 'sx1',
                                                        'sm_ptr', 'sm_s1',
                                                        'threadIdx.x',
                                                        'blockDim.x',
                                                        'b', 'sb0'),
                            "__syncthreads()",
                            "}",
                        ])
     return ret1 + "\n" + ret2
Example #3
0
 def c_support_code_apply(self, node, nodename):
     return nvcc_kernel(
         "kSoftmaxWithBias_%s" % nodename,
         params=[
             "int M",
             "int N",
             "const float * x",
             "const int sx0",
             "const int sx1",
             "const float * b",
             "const int sb0",
             "float * sm",
         ],
         body=[
             "extern __shared__ float buf[]",
             "float * buf2 = buf + N",
             "for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){",
             "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
             "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
             "buf[tx] += b[tx * sb0]",
             "buf2[tx] = buf[tx]",
             "}",
             "__syncthreads()",
             inline_softmax("N", "buf", "buf2", "threadIdx.x", "blockDim.x"),
             "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
             "sm[blockIDX * N + tx] = buf[tx]",
             "}",
             "__syncthreads()",
             "}",
         ],
     )
Example #4
0
 def c_support_code_apply(self, node, nodename):
     return nvcc_kernel(
         "kSoftmaxWithBias_%s" % nodename,
         params=[
             'int M', 'int N', 'const float * x', 'const int sx0',
             'const int sx1', 'const float * b', 'const int sb0',
             'float * sm', 'const int ssm0', 'const int ssm1'
         ],
         body=[
             "extern __shared__ float buf[]",
             "float * buf2 = buf + N",
             "for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){",
             "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
             "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
             "buf[tx] += b[tx * sb0]",
             "buf2[tx] = buf[tx]",
             "}",
             "__syncthreads()",
             inline_softmax('N', 'buf', 'buf2', 'threadIdx.x',
                            'blockDim.x'),
             "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
             "sm[blockIDX * ssm0 + tx * ssm1] = buf[tx]",
             "}",
             "__syncthreads()",
             "}",
         ])
Example #5
0
File: nnet.py Project: vlb/Theano
 def c_support_code_apply(self, node, nodename):
     return nvcc_kernel("kSoftmax_%s"%nodename,
             params=['int M', 'int N',
                 'const float * x', 'const int sx0', 'const int sx1',
                 'float * sm'],
             body=[
                 "extern __shared__ float buf[]",
                 "float * buf2 = buf + N",
                 "for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){",
                   "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                     "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
                     "buf2[tx] = buf[tx]",
                   "}",
                   "__syncthreads()",
                   inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
                   "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                     "sm[blockIDX * N + tx] = buf[tx]",# This set all value correctly
                   "}",
                   "__syncthreads()",
                 "}",
                 ])