Beispiel #1
0
 def c_support_code_apply(self, node, nodename):
     ret1 = nvcc_kernel(
         "kSoftmaxWithBias_%s" % nodename,
         params=['int M', 'int N',
                 'const float * x', 'const int sx0', 'const int sx1',
                 'const float * b', 'const int sb0',
                 'float * sm', 'const int sm_s0', 'const int sm_s1'],
         body=[
                 "extern __shared__ float buf[]",
                 "float * buf2 = buf + N",
                 "for (int blockIDX = blockIdx.x; blockIDX < M;"
                 "     blockIDX += gridDim.x){",
                   "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                      "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
                      "buf[tx] += b[tx * sb0]",
                      "buf2[tx] = buf[tx]",
                   "}",
                    "__syncthreads()",
                    inline_softmax('N', 'buf', 'buf2',
                                   'threadIdx.x', 'blockDim.x'),
                   "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                      "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
                   "}",
                   "__syncthreads()",
                 "}",
         ])
     ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
                        params=['int M', 'int N',
                                'const float * x',
                                'const int sx0', 'const int sx1',
                                'const float * b', 'const int sb0',
                                'float * sm',
                                'const int sm_s0', 'const int sm_s1'],
                        body=[
                            "extern __shared__ float buf[]",
                            "for (int blockIDX = blockIdx.x; blockIDX < M;"
                            "     blockIDX += gridDim.x){",
                            "const float *x_ptr = &x[blockIDX * sx0]",
                            "float *sm_ptr = &sm[blockIDX * sm_s0]",
                            inline_softmax_fixed_shared('N', 'buf',
                                                        'x_ptr', 'sx1',
                                                        'sm_ptr', 'sm_s1',
                                                        'threadIdx.x',
                                                        'blockDim.x',
                                                        'b', 'sb0'),
                            "__syncthreads()",
                            "}",
                        ])
     return ret1 + "\n" + ret2
Beispiel #2
0
 def c_support_code_apply(self, node, nodename):
     ret1 = nvcc_kernel(
         "kSoftmaxWithBias_%s" % nodename,
         params=['int M', 'int N',
                 'const float * x', 'const int sx0', 'const int sx1',
                 'const float * b', 'const int sb0',
                 'float * sm', 'const int sm_s0', 'const int sm_s1'],
         body=[
                 "extern __shared__ float buf[]",
                 "float * buf2 = buf + N",
                 "for (int blockIDX = blockIdx.x; blockIDX < M;"
                 "     blockIDX += gridDim.x){",
                   "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                      "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
                      "buf[tx] += b[tx * sb0]",
                      "buf2[tx] = buf[tx]",
                   "}",
                    "__syncthreads()",
                    inline_softmax('N', 'buf', 'buf2',
                                   'threadIdx.x', 'blockDim.x'),
                   "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                      "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
                   "}",
                   "__syncthreads()",
                 "}",
         ])
     ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
                        params=['int M', 'int N',
                                'const float * x',
                                'const int sx0', 'const int sx1',
                                'const float * b', 'const int sb0',
                                'float * sm',
                                'const int sm_s0', 'const int sm_s1'],
                        body=[
                            "extern __shared__ float buf[]",
                            "for (int blockIDX = blockIdx.x; blockIDX < M;"
                            "     blockIDX += gridDim.x){",
                            "const float *x_ptr = &x[blockIDX * sx0]",
                            "float *sm_ptr = &sm[blockIDX * sm_s0]",
                            inline_softmax_fixed_shared('N', 'buf',
                                                        'x_ptr', 'sx1',
                                                        'sm_ptr', 'sm_s1',
                                                        'threadIdx.x',
                                                        'blockDim.x',
                                                        'b', 'sb0'),
                            "__syncthreads()",
                            "}",
                        ])
     return ret1 + "\n" + ret2